diff --git a/README.md b/README.md new file mode 100644 index 0000000..983305d --- /dev/null +++ b/README.md @@ -0,0 +1,103 @@ +# ovos-date-parser + +`ovos-date-parser` is a comprehensive library for multilingual date and time parsing, extraction, and formatting, +designed to handle a range of human-readable date, time, and duration expressions. + +## Features + +- **Date and Time Extraction**: Extract specific dates and times from natural language phrases in various languages. +- **Duration Parsing**: Parse phrases that indicate a span of time, such as "two hours and fifteen minutes." +- **Friendly Time Formatting**: Format time for human-friendly output, supporting both 12-hour and 24-hour formats. +- **Relative Time Descriptions**: Generate relative descriptions (e.g., "tomorrow," "in three days") for given dates. +- **Multilingual Support**: Includes extraction and formatting methods for multiple languages, such as English, Spanish, + French, German, and more. + +## Installation + +```bash +pip install ovos-date-parser +``` + +## Usage + +### Date and Time Extraction + +Extract specific dates and times from a phrase. This function identifies date-related terms in natural language and +returns both the datetime object and any remaining text. + +```python +from ovos_date_parser import extract_datetime + +result = extract_datetime("Meet me next Friday at 3pm", lang="en") +print(result) # (datetime for next Friday at 3pm, leftover text such as "meet me") +``` + +### Duration Extraction + +Identify duration phrases in text and convert them into a `timedelta` object. This can parse common human-friendly +duration expressions like "30 minutes" or "two and a half hours." 
+ +```python +from ovos_date_parser import extract_duration + +duration, remainder = extract_duration("It will take about 2 hours and 30 minutes", lang="en") +print(duration) # timedelta object +print(remainder) # "about" +``` + +### Formatting Time + +Generate a natural-sounding time format suitable for voice or display in different languages, allowing customization for +speech or written text. + +```python +from ovos_date_parser import nice_time +from datetime import datetime + +dt = datetime.now() +formatted_time = nice_time(dt, lang="en", speech=True, use_24hour=False) +print(formatted_time) # e.g. "three o'clock" +``` + +### Relative Time Descriptions + +Create relative phrases for describing dates and times in relation to the current moment or a reference datetime. Currently only Basque (`eu`) is implemented; any other language code raises `NotImplementedError`. + +```python +from ovos_date_parser import nice_relative_time +from datetime import datetime, timedelta + +relative_time = nice_relative_time(datetime.now() + timedelta(days=1), datetime.now(), lang="eu") +print(relative_time) # a short relative description in Basque +``` + +### Languages Supported + +`ovos-date-parser` supports a wide array of languages, each with its own set of methods for handling natural language +time expressions. 
Available methods include `extract_datetime`, `extract_duration`, `nice_time`, `nice_relative_time`, and `nice_duration` for +the following languages: + +| Language | `nice_time` | `nice_relative_time` | `nice_duration` | `extract_duration` | `extract_datetime` | +|----------|-------------|----------------------|-----------------|--------------------|--------------------| +| az | ✅ | ❌ | ✅ | ✅ | ✅ | +| ca | ✅ | ❌ | ❌ | ❌ | ✅ | +| cs | ✅ | ❌ | ❌ | ✅ | ✅ | +| da | ✅ | ❌ | ❌ | ❌ | ✅ | +| de | ✅ | ❌ | ❌ | ✅ | ✅ | +| en | ✅ | ❌ | ❌ | ✅ | ✅ | +| es | ✅ | ❌ | ❌ | ✅ | ✅ | +| eu | ✅ | ✅ | ❌ | ❌ | ✅ | +| fa | ✅ | ❌ | ❌ | ✅ | ✅ | +| fr | ✅ | ❌ | ❌ | ❌ | ✅ | +| hu | ✅ | ❌ | ❌ | ❌ | ❌ | +| it | ✅ | ❌ | ❌ | ❌ | ✅ | +| nl | ✅ | ❌ | ❌ | ✅ | ✅ | +| pl | ✅ | ❌ | ✅ | ✅ | ✅ | +| pt | ✅ | ❌ | ❌ | ✅ | ✅ | +| ru | ✅ | ❌ | ✅ | ✅ | ✅ | +| sv | ✅ | ❌ | ❌ | ✅ | ✅ | +| uk | ✅ | ❌ | ✅ | ✅ | ✅ | + +## License + +This project is licensed under the Apache 2.0 License \ No newline at end of file diff --git a/ovos_date_parser/__init__.py b/ovos_date_parser/__init__.py new file mode 100644 index 0000000..7b99d81 --- /dev/null +++ b/ovos_date_parser/__init__.py @@ -0,0 +1,288 @@ +from datetime import datetime, timedelta, time +from typing import Optional, Tuple, Union + +from ovos_date_parser.dates_az import ( + extract_datetime_az, + extract_duration_az, + nice_duration_az, + nice_time_az, +) +from ovos_date_parser.dates_ca import ( + TimeVariantCA, + extract_datetime_ca, + nice_time_ca, +) +from ovos_date_parser.dates_cs import ( + extract_duration_cs, + extract_datetime_cs, + nice_time_cs, +) +from ovos_date_parser.dates_da import ( + extract_datetime_da, + nice_time_da, +) +from ovos_date_parser.dates_de import ( + extract_datetime_de, + extract_duration_de, + nice_time_de, +) +from ovos_date_parser.dates_en import ( + extract_datetime_en, + extract_duration_en, + nice_time_en, +) +from ovos_date_parser.dates_es import ( + extract_datetime_es, + extract_duration_es, + nice_time_es, +) +from ovos_date_parser.dates_eu 
import ( + extract_datetime_eu, + nice_time_eu, + nice_relative_time_eu, +) +from ovos_date_parser.dates_fa import ( + extract_datetime_fa, + nice_time_fa, + extract_duration_fa, +) +from ovos_date_parser.dates_fr import ( + extract_datetime_fr, + nice_time_fr +) +from ovos_date_parser.dates_hu import nice_time_hu +from ovos_date_parser.dates_it import ( + extract_datetime_it, + nice_time_it +) +from ovos_date_parser.dates_nl import ( + extract_datetime_nl, + nice_part_of_day_nl, + extract_duration_nl, + nice_time_nl +) +from ovos_date_parser.dates_pl import ( + extract_datetime_pl, + extract_duration_pl, + nice_time_pl, + nice_duration_pl +) +from ovos_date_parser.dates_pt import ( + extract_datetime_pt, + extract_duration_pt, + nice_time_pt +) +from ovos_date_parser.dates_ru import ( + extract_datetime_ru, + extract_duration_ru, + nice_time_ru, + nice_duration_ru +) +from ovos_date_parser.dates_sv import ( + extract_datetime_sv, + extract_duration_sv, + nice_time_sv +) +from ovos_date_parser.dates_uk import ( + extract_datetime_uk, + extract_duration_uk, + nice_time_uk, +nice_duration_uk +) + + +def nice_time( + dt: datetime, + lang: str, + speech: bool = True, + use_24hour: bool = False, + use_ampm: bool = False, + variant: Optional[TimeVariantCA] = None, +) -> str: + """ + Format a time to a comfortable human format. + + Args: + dt: date to format (assumes already in local timezone). + lang: A BCP-47 language code. + speech: Format for speech (default is True) or display (False). + use_24hour: Output in 24-hour/military or 12-hour format. + use_ampm: Include the am/pm for 12-hour format. + variant: Optional variant for Catalan (ca). + + Returns: + The formatted time string. 
+ """ + if lang.startswith("az"): + return nice_time_az(dt, speech, use_24hour, use_ampm) + if lang.startswith("ca"): + return nice_time_ca(dt, speech, use_24hour, use_ampm, variant=variant) + if lang.startswith("cs"): + return nice_time_cs(dt, speech, use_24hour, use_ampm) + if lang.startswith("da"): + return nice_time_da(dt, speech, use_24hour, use_ampm) + if lang.startswith("de"): + return nice_time_de(dt, speech, use_24hour, use_ampm) + if lang.startswith("en"): + return nice_time_en(dt, speech, use_24hour, use_ampm) + if lang.startswith("es"): + return nice_time_es(dt, speech, use_24hour, use_ampm) + if lang.startswith("eu"): + return nice_time_eu(dt, speech, use_24hour, use_ampm) + if lang.startswith("fa"): + return nice_time_fa(dt, speech, use_24hour, use_ampm) + if lang.startswith("fr"): + return nice_time_fr(dt, speech, use_24hour, use_ampm) + if lang.startswith("hu"): + return nice_time_hu(dt, speech, use_24hour, use_ampm) + if lang.startswith("it"): + return nice_time_it(dt, speech, use_24hour, use_ampm) + if lang.startswith("nl"): + return nice_time_nl(dt, speech, use_24hour, use_ampm) + if lang.startswith("pl"): + return nice_time_pl(dt, speech, use_24hour, use_ampm) + if lang.startswith("pt"): + return nice_time_pt(dt, speech, use_24hour, use_ampm) + if lang.startswith("ru"): + return nice_time_ru(dt, speech, use_24hour, use_ampm) + if lang.startswith("sv"): + return nice_time_sv(dt, speech, use_24hour, use_ampm) + if lang.startswith("uk"): + return nice_time_uk(dt, speech, use_24hour, use_ampm) + raise NotImplementedError(f"Unsupported language: {lang}") + + +def nice_relative_time(when, relative_to, lang): + """Create a relative phrase to roughly describe a datetime + + Examples are "25 seconds", "tomorrow", "7 days". + + Args: + when (datetime): Local timezone + relative_to (datetime): Baseline for relative time (required; there is no default) + lang (str): A BCP-47 language code (required); only codes starting with "eu" are handled, any other raises NotImplementedError. 
+ Returns: + str: Relative description of the given time + """ + if lang.startswith("eu"): + return nice_relative_time_eu(when, relative_to) + raise NotImplementedError(f"Unsupported language: {lang}") + + +def nice_duration( + duration: Union[int, float], lang: str, speech: bool = True +) -> str: + """ + Convert duration in seconds to a nice spoken timespan. + + Args: + duration: Time in seconds. + lang: A BCP-47 language code. + speech: Format for speech (True) or display (False). + + Returns: + Timespan as a string. + """ + if lang.startswith("az"): + return nice_duration_az(duration, speech) + if lang.startswith("pl"): + return nice_duration_pl(duration, speech) + if lang.startswith("ru"): + return nice_duration_ru(duration, speech) + if lang.startswith("uk"): + return nice_duration_uk(duration, speech) + raise NotImplementedError(f"Unsupported language: {lang}") + + +def extract_duration( + text: str, lang: str +) -> Tuple[Optional[timedelta], str]: + """ + Convert a phrase into a number of seconds and return the remainder text. + + Args: + text: String containing a duration. + lang: A BCP-47 language code. + + Returns: + A tuple containing the duration as timedelta and the remaining text. 
+ """ + if lang.startswith("az"): + return extract_duration_az(text) + if lang.startswith("cs"): + return extract_duration_cs(text) + if lang.startswith("de"): + return extract_duration_de(text) + if lang.startswith("en"): + return extract_duration_en(text) + if lang.startswith("es"): + return extract_duration_es(text) + if lang.startswith("fa"): + return extract_duration_fa(text) + if lang.startswith("nl"): + return extract_duration_nl(text) + if lang.startswith("pl"): + return extract_duration_pl(text) + if lang.startswith("pt"): + return extract_duration_pt(text) + if lang.startswith("ru"): + return extract_duration_ru(text) + if lang.startswith("sv"): + return extract_duration_sv(text) + if lang.startswith("uk"): + return extract_duration_uk(text) + raise NotImplementedError(f"Unsupported language: {lang}") + + +def extract_datetime( + text: str, + lang: str, + anchorDate: Optional[datetime] = None, + default_time: Optional[time] = None, +) -> Optional[Tuple[datetime, str]]: + """ + Extract date and time information from a sentence. + + Args: + text: The text to be interpreted. + lang: The BCP-47 code for the language to use. + anchorDate: Date to use for relative dating. + default_time: Time to use if none was found in the input string. + + Returns: + A tuple with the extracted date as datetime and the leftover string, + or None if no date or time related text is found. 
+ """ + if lang.startswith("az"): + return extract_datetime_az(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("ca"): + return extract_datetime_ca(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("cs"): + return extract_datetime_cs(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("da"): + return extract_datetime_da(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("de"): + return extract_datetime_de(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("en"): + return extract_datetime_en(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("es"): + return extract_datetime_es(text, anchorDate=anchorDate, default_time=default_time) + # NOTE(review): extract_datetime_eu is imported by this module but was never dispatched; + # assumes it accepts the same keyword arguments as its siblings - confirm + if lang.startswith("eu"): + return extract_datetime_eu(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("fa"): + return extract_datetime_fa(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("fr"): + return extract_datetime_fr(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("it"): + return extract_datetime_it(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("nl"): + return extract_datetime_nl(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("pl"): + return extract_datetime_pl(text, anchorDate=anchorDate, default_time=default_time) + # Portuguese must use the Portuguese parser (previously routed to the Polish one) + if lang.startswith("pt"): + return extract_datetime_pt(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("ru"): + return extract_datetime_ru(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("sv"): + return extract_datetime_sv(text, anchorDate=anchorDate, default_time=default_time) + if lang.startswith("uk"): + return extract_datetime_uk(text, anchorDate=anchorDate, default_time=default_time) + raise NotImplementedError(f"Unsupported language: {lang}") diff --git a/ovos_date_parser/dates_az.py b/ovos_date_parser/dates_az.py new file mode 100644 index 0000000..6fd7cb4 --- 
/dev/null +++ b/ovos_date_parser/dates_az.py @@ -0,0 +1,840 @@ +import re +from datetime import datetime, timedelta + +from dateutil.relativedelta import relativedelta +from ovos_number_parser.numbers_az import pronounce_number_az, extract_number_az, _convert_words_to_numbers_az +from ovos_number_parser.util import is_numeric +from ovos_utils.time import now_local + +_HARD_VOWELS = ['a', 'ı', 'o', 'u'] +_SOFT_VOWELS = ['e', 'ə', 'i', 'ö', 'ü'] +_VOWELS = _HARD_VOWELS + _SOFT_VOWELS + + +def _get_full_time_ak(hour): + if hour in [1, 3, 4, 5, 8, 11]: + return "ə" + if hour in [2, 7, 12]: + return "yə" + if hour in [9, 10]: + return "a" + return "ya" + + +def _get_half_time_ak(hour): + if hour in [1, 5, 8, 11]: + return "in" + if hour in [2, 7, 12]: + return "nin" + if hour in [3, 4]: + return "ün" + if hour in [9, 10]: + return "un" + return "nın" + + +def _get_daytime(hour): + if hour < 6: + return "gecə" + if hour < 12: + return "səhər" + if hour < 18: + return "gündüz" + return "axşam" + + +def _get_last_vowel(word): + is_last = True + for char in word[::-1]: + if char in _VOWELS: + return char, is_last + is_last = False + + return "", is_last + + +def _last_vowel_type(word): + return _get_last_vowel(word)[0] in _HARD_VOWELS + + +def _generate_plurals_az(originals): + """ + Return a new set or dict containing the plural form of the original values, + + In Azerbaijani this means appending 'lar' or 'lər' to them according to the last vowel in word. 
+ + Args: + originals set(str) or dict(str, any): values to pluralize + + Returns: + set(str) or dict(str, any) + + """ + + if isinstance(originals, dict): + return {key + ('lar' if _last_vowel_type(key) else 'lər'): value for key, value in originals.items()} + return {value + ('lar' if _last_vowel_type(value) else 'lər') for value in originals} + + +def nice_time_az(dt, speech=True, use_24hour=False, use_ampm=False): + """ + Format a time to a comfortable human format + For example, generate 'altının yarısı' for speech or '5:30' for + text display. + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False) + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the part-of-day word (gecə/səhər/gündüz/axşam) for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "gecə 3:01" or "gündüz 2:22" (part-of-day word instead of AM/PM) + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + string = _get_daytime(dt.hour) + " " + string + else: + # e.g. 
"3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + if use_24hour: + speak = "" + + # Either "0 8" or "13" + if string[0] == '0': + speak += pronounce_number_az(int(string[0])) + " " + speak += pronounce_number_az(int(string[1])) + else: + speak = pronounce_number_az(int(string[0:2])) + + speak += " " + if string[3] == '0': + speak += pronounce_number_az(0) + " " + speak += pronounce_number_az(int(string[4])) + else: + speak += pronounce_number_az(int(string[3:5])) + return speak + else: + + hour = dt.hour % 12 or 12 # 12 hour clock and 0 is spoken as 12 + next_hour = (dt.hour + 1) % 12 or 12 + speak = "" + if use_ampm: + speak += _get_daytime(dt.hour) + " " + + if dt.minute == 0: + speak += "{} tamamdır".format(pronounce_number_az(hour)) + elif dt.minute < 30: + speak += "{}{} {} dəqiqə işləyib".format(pronounce_number_az(next_hour), _get_full_time_ak(next_hour), + pronounce_number_az(dt.minute)) + elif dt.minute == 30: + speak += "{}{} yarısı".format(pronounce_number_az(next_hour), _get_half_time_ak(next_hour)) + else: + # after the half hour, speak the minutes still remaining until next_hour + speak += "{}{} {} dəqiqə qalıb".format(pronounce_number_az(next_hour), _get_full_time_ak(next_hour), + pronounce_number_az(60 - dt.minute)) + + return speak + + +def nice_duration_az(duration, speech=True): + """ Convert duration in seconds to a nice spoken timespan + + Examples: + duration = 60 -> "1:00" or "bir dəqiqə" + duration = 163 -> "2:43" or "iki dəqiqə qırx üç saniyə" + + Args: + duration: time, in seconds + speech (bool): format for speech (True) or display (False) + + Returns: + str: timespan as a string + """ + + if isinstance(duration, timedelta): + duration = duration.total_seconds() + + # Do traditional rounding: 2.5->3, 3.5->4, plus this + # helps in a few cases of where calculations generate + # times like 2:59:59.9 instead of 3:00. 
+ duration += 0.5 + + days = int(duration // 86400) + hours = int(duration // 3600 % 24) + minutes = int(duration // 60 % 60) + seconds = int(duration % 60) + + if speech: + out = "" + if days > 0: + out += pronounce_number_az(days) + " " + out += "gün" + if hours > 0: + if out: + out += " " + out += pronounce_number_az(hours) + " " + out += "saat" + if minutes > 0: + if out: + out += " " + out += pronounce_number_az(minutes) + " " + out += "dəqiqə" + if seconds > 0: + if out: + out += " " + out += pronounce_number_az(seconds) + " " + out += "saniyə" + else: + # M:SS, MM:SS, H:MM:SS, Dd H:MM:SS format + out = "" + if days > 0: + out = str(days) + "g " + if hours > 0 or days > 0: + out += str(hours) + ":" + if minutes < 10 and (hours > 0 or days > 0): + out += "0" + out += str(minutes) + ":" + if seconds < 10: + out += "0" + out += str(seconds) + + return out + + +def extract_duration_az(text): + """ + Convert an azerbaijani phrase into a number of seconds + + Convert things like: + "10 dəqiqə" + "2 yarım saat" + "3 gün 8 saat 10 dəqiqə 49 saniyə" + into an int, representing the total number of seconds. + + The words used in the duration will be consumed, and + the remainder returned. + + As an example, "5 dəqiqəyə taymer qur" would return + (300, "taymer qur"). + + Args: + text (str): string containing a duration + + Returns: + (timedelta, str): + A tuple containing the duration and the remaining text + not consumed in the parsing. The first value will + be None if no duration is found. The text returned + will have whitespace stripped from the ends. 
+ """ + if not text: + return None + + time_units = { + 'microseconds': 0, + 'milliseconds': 0, + 'seconds': 0, + 'minutes': 0, + 'hours': 0, + 'days': 0, + 'weeks': 0 + } + + time_units_az = { + 'mikrosaniyə': 'microseconds', + 'milisaniyə': 'milliseconds', + 'saniyə': 'seconds', + 'dəqiqə': 'minutes', + 'saat': 'hours', + 'gün': 'days', + 'həftə': 'weeks' + } + + # group 1 (value): the numeric amount; group 2 (half): optional trailing "yarım"/0.5 adding half a unit + pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}?(?:yə|a|ə)?(?:(?:\s|,)+)?(?P<half>yarım|0\.5)?(?:a)?" + text = _convert_words_to_numbers_az(text) + for unit_az in time_units_az: + unit_pattern = pattern.format(unit=unit_az) + + def repl(match): + time_units[time_units_az[unit_az]] += float(match.group(1)) + (0.5 if match.group(2) else 0) + return '' + + text = re.sub(unit_pattern, repl, text) + + text = text.strip() + duration = timedelta(**time_units) if any(time_units.values()) else None + + return (duration, text) + + +def extract_datetime_az(text, anchorDate=None, default_time=None): + """ Convert a human date reference into an exact datetime + + Convert things like + "bu gün" + "sabah günortadan sonra" + "gələn çərşənbə axşamı günorta 4 də" + "3 avqust" + into a datetime. If a reference date is not provided, the current + local time is used. Also consumes the words used to define the date + returning the remaining string. For example, the string + "çərşənbə axşamı hava necədir" + returns the date for the forthcoming çərşənbə axşamı relative to the reference + date and the remainder string + "hava necədir". + + The "gələn" instance of a day or weekend is considered to be no earlier than + 48 hours in the future. On Friday, "gələn Bazar ertəsi" would be in 3 days. + On Saturday, "gələn Bazar ertəsi" would be in 9 days. 
+ + Args: + text (str): string containing date words + anchorDate (datetime): A reference date/time for "sabah", etc + default_time (time): Time to set if no time was found in the string + + Returns: + [datetime, str]: An array containing the datetime and the remaining + text not consumed in the parsing, or None if no + date or time related text was found. + """ + + def clean_string(s, word_list): + # normalize and lowercase utt (replaces words with numbers) + s = _convert_words_to_numbers_az(s, ordinals=None) + # clean unneeded punctuation and capitalization among other things. + s = s.lower().replace('?', '').replace('.', '').replace(',', '') + + wordList = s.split() + skip_next_word = False + new_words = [] + for idx, word in enumerate(wordList): + if skip_next_word: + skip_next_word = False + continue + wordNext = wordList[idx + 1] if idx + 1 < len(wordList) else "" + ordinals = ["ci", "cü", "cı", "cu"] + if word[0].isdigit(): + for ordinal in ordinals: + if ordinal in wordNext: + skip_next_word = True + # merge two-token expressions (e.g. "çərşənbə axşamı") into a single word; + # tokens come from split(), so they never contain whitespace + if ((word == "bu" and wordNext == "gün") or + (word in ['cümə', 'çərşənbə'] and 'axşamı' in wordNext) or + (word == 'bazar' and 'ertəsi' in wordNext) or + (word == 'günortadan' and wordNext == 'sonra') or + (word == 'gecə' and 'yarısı' in wordNext)): + word = word + ' ' + wordNext + skip_next_word = True + + for orig_word in word_list: + if word.startswith(orig_word): + word = orig_word + break + + new_words.append(word) + + return new_words + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if not anchorDate: + anchorDate = now_local() + + if text == "": + return None + + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + today = anchorDate.strftime("%w") + currentYear = anchorDate.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + 
timeQualifier = "" + word_list = [] + timeQualifiersAM = ['səhər', 'gecə'] + timeQualifiersPM = ['günorta', 'axşam', 'nahar'] + word_list += timeQualifiersAM + timeQualifiersPM + timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) + markers = ['da', 'də', 'sonra', "ərzində", "günündən", "günü", "gündən", "gün"] + days = ['bazar ertəsi', 'çərşənbə axşamı', 'çərşənbə', + 'cümə axşamı', 'cümə', 'şənbə', 'bazar'] + months = ['yanvar', 'fevral', 'mart', 'aprel', 'may', 'iyun', + 'iyul', 'avqust', 'sentyabr', 'oktyabr', 'noyabr', + 'dekabr'] + eng_months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + word_list += days + months + # pluralise whole day names; passing a single string would pluralise each character instead + recur_markers = days + list(_generate_plurals_az(days)) + ['həftə sonu', 'iş günü', + 'həftə sonları', 'iş günləri'] + # NOTE(review): 'ıyn'/'ıyl' use a dotless ı while months uses 'iyun'/'iyul' - confirm + monthsShort = ['yan', 'fev', 'mar', 'apr', 'may', 'ıyn', 'ıyl', 'avq', + 'sen', 'okt', 'noy', 'dek'] + year_multiples = ["onillik", "yüzillik", "minillik"] + day_multiples = ["həftə", "ay", "il"] + word_list += year_multiples + day_multiples + ['saat', 'dəqiqə', 'saniyə', 'sonra', 'gecə yarısı', + 'günortadan sonra', 'gün'] + word_list.sort(key=lambda x: len(x), reverse=True) + words = clean_string(text, word_list) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + if word == "indi" and not datestr: + resultStr = " ".join(words[idx + 1:]) + resultStr = ' '.join(resultStr.split()) + extractedDate = anchorDate.replace(microsecond=0) + return [extractedDate, resultStr] + elif wordNext in year_multiples: + multiplier = None + if is_numeric(word): + multiplier = 
extract_number_az(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if "onillik" in wordNext: + yearOffset = multiplier * 10 + elif "yüzillik" in wordNext: + yearOffset = multiplier * 100 + elif "minillik" in wordNext: + yearOffset = multiplier * 1000 + elif word in timeQualifiersList: + timeQualifier = word + # parse bu qün, sabah, srağagün, dünən, birigün + elif word == "bu gün" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "sabah" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "srağagün" and not fromFlag: + dayOffset = -2 + used += 1 + elif word == "dünən" and not fromFlag: + dayOffset = -1 + used += 1 + elif word == "birigün" and not fromFlag: + dayOffset = 2 + used = 1 + # parse 5 gün, 10 həftə, keçən həftə, gələn həftə + elif word == "gün": + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + if wordNext == "sonra": + used += 1 + elif word == "həftə" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + if wordNext == "sonra": + used += 1 + elif wordPrev == "gələn": + dayOffset = 7 + start -= 1 + used = 2 + if wordNext == "sonra": + used += 1 + elif wordPrev == "keçən": + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "ay" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "gələn": + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "keçən": + monthOffset = -1 + start -= 1 + used = 2 + # parse 5 il, gələn il, keçən il + elif word == "il" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "gələn": + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "keçən": + yearOffset = -1 + start -= 1 + used = 2 + if wordNext in markers: + used += 1 + # parse Monday, Tuesday, etc., 
and next Monday, + # last Tuesday, etc. + elif word in days and not fromFlag: + if wordNext in markers: + used += 1 + d = days.index(word) + dayOffset = (d + 1) - int(today) + used += 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "gələn": + if dayOffset <= 2: + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "keçən": + dayOffset -= 7 + used += 1 + start -= 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort and not fromFlag: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = eng_months[m] + if wordPrev and wordPrev[0].isdigit(): + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + if (wordNextNext and wordNextNext in markers) or wordNextNext == 'il': + used += 1 + else: + if wordNext and wordNext in markers: + used += 1 + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + if wordNextNextNext and wordNextNextNext in markers: + used += 1 + else: + if wordNextNext and wordNextNext in markers: + used += 1 + hasYear = False + + elif word == "bu": + used += 1 + dayOffset = 0 + if wordNext in markers: + used += 1 + + if used > 0: + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" 
+ # parse günorta, gecə yarısı, səhər, günortadan sonra, axşam, gecə + used = 0 + if word == "günorta": + hrAbs = 12 + used += 1 + elif word == "gecə yarısı": + hrAbs = 0 + used += 1 + elif word == "səhər": + if hrAbs is None: + hrAbs = 8 + used += 1 + elif word == "günortadan sonra": + if hrAbs is None: + hrAbs = 15 + used += 1 + elif word == "axşam": + if hrAbs is None: + hrAbs = 19 + used += 1 + elif word == "gecə": + if hrAbs is None: + hrAbs = 21 + used += 1 + # parse yarım saat + elif word == "saat": + if wordPrev == "yarım": + minOffset = 30 + if wordNext in markers: + used += 1 + + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if ':' in word: + # parse colons + # "gecə 3:00" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + + else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." 
or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + wordNext in recur_markers or + wordNextNext in recur_markers): + # Ex: "7 on mondays" or "3 this friday" + # Set strHH so that isTime == True + # when am or pm is not specified + strHH = strNum + used = 1 + else: + if ( + ("saat" in wordNext or "saat" in remainder) and + word[0] != '0' and + ( + int(strNum) < 100 or + int(strNum) > 2400 + )): + # "3 saat" + hrOffset = int(strNum) + used = 1 + isTime = False + hrAbs = -1 + minAbs = -1 + elif "dəqiqə" in wordNext or "dəqiqə" in remainder: + # "10 dəqiqə" (the unit may also be glued to the number, which lands in remainder) + minOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + if wordNextNext in markers: + used += 1 + elif "saniyə" in wordNext or "saniyə" in remainder: + # 5 saniyə + secOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext and wordNext[0].isdigit(): + # military time, e.g. "04 38 hours" + strHH = strNum + strMM = wordNext + military = True + used += 1 + if (wordNextNext and wordNextNext == "da" or + wordNextNext == "də" or + remainder == "da" or remainder == "də"): + used += 1 + elif wordNext in markers: + strHH = strNum + + HH = int(strHH) if strHH else 0 + MM = int(strMM) if strMM else 0 + if timeQualifier in timeQualifiersPM and HH < 12: + HH += 12 + + if HH > 24 or MM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = HH + minAbs = MM + used += 1 + + if wordNext in markers or word in markers: + used += 1 + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = anchorDate.replace(microsecond=0) + + if datestr != "": + # date included an explicit date, e.g. 
"iyun 5" or "iyun 2, 2017" + try: + temp = datetime.strptime(datestr, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(datestr, "%B %d %Y") + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + if not hasYear: + temp = temp.replace(year=extractedDate.year, + tzinfo=extractedDate.tzinfo) + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hrOffset == 0 and minOffset == 0 and secOffset == 0: + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + if hrAbs != -1 and minAbs != -1: + # If no time was supplied in the string set the time to default + # time if it's available + if hrAbs is None and minAbs is None and default_time is not None: + hrAbs, minAbs = default_time.hour, default_time.minute + else: + hrAbs = hrAbs or 0 + minAbs = minAbs or 0 + + extractedDate = extractedDate + relativedelta(hours=hrAbs, + minutes=minAbs) + if (hrAbs != 0 or minAbs != 0) and datestr == "": + if not daySpecified and anchorDate > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if 
class TimeVariantCA(IntEnum):
    """Catalan conventions for telling the time supported by nice_time_ca."""
    DEFAULT = 0  # plain watch time: hour plus minutes
    BELL = 1  # quarter-based "bell" system
    FULL_BELL = 2  # bell system with approximate "tocat"/"passat" wording
    SPANISH_LIKE = 3  # "i quart" / "menys quart" phrasing


# Full bell system: (exclusive upper minute bound, spoken fraction).
# Minutes below 7 and minute 59 are spoken as a plain hour instead.
_CA_FULL_BELL_STEPS = (
    (9, "mig quart"),
    (12, "mig quart passat"),
    (14, "mig quart ben passat"),
    (17, "un quart"),
    (20, "un quart tocat"),
    (22, "un quart ben tocat"),
    (24, "un quart i mig"),
    (27, "un quart i mig passat"),
    (29, "un quart i mig ben passat"),
    (32, "dos quarts"),
    (35, "dos quarts tocats"),
    (37, "dos quarts ben tocats"),
    (39, "dos quarts i mig"),
    (42, "dos quarts i mig passats"),
    (44, "dos quarts i mig ben passats"),
    (47, "tres quarts"),
    (50, "tres quarts tocats"),
    (52, "tres quarts ben tocats"),
    (54, "tres quarts i mig"),
    (57, "tres quarts i mig passats"),
    (59, "tres quarts i mig ben passats"),
)


def _ca_day_period(hour):
    """Return the spoken day-period suffix for an hour in 0-23."""
    if hour == 0:
        return " de la nit"
    if hour < 6:
        return " de la matinada"
    if hour < 12:
        return " del matí"
    if hour == 12:
        return " del migdia"
    if hour < 19:
        return " de la tarda"
    if hour < 21:
        return " del vespre"
    return " de la nit"


def _ca_hour_name(hour12):
    """Return the spoken hour (with article) for an hour in 0-11."""
    if hour12 == 0:
        return "les dotze"
    # 1 and 2 take the feminine form when talking about hours
    if hour12 == 1:
        return "la una"
    if hour12 == 2:
        return "les dues"
    return "les " + pronounce_number_ca(hour12)


def _ca_upcoming_hour(dt):
    """Return ' de <next hour> <day period>' as used after a quarter phrase."""
    next_hour = (dt.hour + 1) % 12
    if next_hour == 0:
        return " de dotze" + (" del migdia" if dt.hour == 11
                              else " de la nit")
    if next_hour == 1:
        return " d'una" + (" de la tarda" if dt.hour == 12
                           else " de la matinada")
    if next_hour == 2:
        return " de dues" + (" de la tarda" if dt.hour == 13
                             else " de la nit")
    if next_hour == 11:
        return " d'onze" + (" de la nit" if dt.hour == 22
                            else " del matí")

    # Remaining cases: dt.hour is 2-9 or 14-21.
    phrase = " de " + pronounce_number_ca(next_hour)
    if dt.hour < 5:
        # was `dt.hour == 0 and dt.hour < 5`, which never matched here
        phrase += " de la matinada"
    elif dt.hour < 11:
        phrase += " del matí"
    elif dt.hour <= 17:
        phrase += " de la tarda"
    elif dt.hour < 20:
        phrase += " del vespre"
    else:
        # 20h used to fall in a gap and got no period at all
        phrase += " de la nit"
    return phrase


def _ca_bell_fraction(minute):
    """Return the bell-system quarter phrase, or None for a plain hour."""
    if minute < 7 or 8 < minute < 15:
        return None
    if minute in (7, 8):
        return "mig quart"
    quarters = ("un quart", "dos quarts", "tres quarts")[minute // 15 - 1]
    extra = minute % 15
    if extra == 0:
        return quarters
    if extra == 1:
        return quarters + " i un minut"
    if extra in (7, 8):
        return quarters + " i mig"
    return quarters + " i " + pronounce_number_ca(extra) + " minuts"


def _ca_full_bell_fraction(minute):
    """Return the full-bell-system phrase, or None for a plain hour."""
    if minute < 7 or minute >= 59:
        return None
    for upper_bound, phrase in _CA_FULL_BELL_STEPS:
        if minute < upper_bound:
            return phrase
    return None


def nice_time_ca(dt, speech=True, use_24hour=False, use_ampm=False,
                 variant=TimeVariantCA.DEFAULT):
    """
    Format a time to a comfortable human format
    For example, generate 'cinc trenta' for speech or '5:30' for
    text display.
    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
        variant (TimeVariantCA): spoken time system; a falsy value means
            TimeVariantCA.DEFAULT. Only used when speech is True.
    Returns:
        (str): The formatted time string
    """
    variant = variant or TimeVariantCA.DEFAULT

    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        if use_ampm:
            # e.g. "3:01 AM" or "2:22 PM"
            string = dt.strftime("%I:%M %p")
        else:
            # e.g. "3:01" or "2:22"
            string = dt.strftime("%I:%M")
        if string[0] == '0':
            string = string[1:]  # strip leading zeros

    if not speech:
        return string

    # Generate a speakable version of the time
    if variant == TimeVariantCA.BELL:
        # Bell Catalan Time System
        # https://en.wikipedia.org/wiki/Catalan_time_system
        fraction = _ca_bell_fraction(dt.minute)
        if fraction is not None:
            return fraction + _ca_upcoming_hour(dt)

        speak = _ca_hour_name(dt.hour % 12)
        if dt.minute == 0:
            speak += " en punt"
        elif dt.minute == 1:
            speak += " i un minut"
        else:
            speak += " i " + pronounce_number_ca(dt.minute) + " minuts"
        # 11h used to fall between "< 11" and "== 12" and got no period
        return speak + _ca_day_period(dt.hour)

    if variant == TimeVariantCA.FULL_BELL:
        # Full Bell Catalan Time System
        # https://en.wikipedia.org/wiki/Catalan_time_system
        fraction = _ca_full_bell_fraction(dt.minute)
        if fraction is not None:
            return fraction + _ca_upcoming_hour(dt)

        # Minute 59 is announced as the upcoming hour.
        rounds_up = dt.minute == 59
        hour = (dt.hour + 1) % 12 if rounds_up else dt.hour % 12
        speak = _ca_hour_name(hour)

        if dt.minute < 2:
            # minute 1 used to be spoken as "ben tocades"
            speak += " en punt"
        elif dt.minute < 5:
            speak += " tocada" if hour == 1 else " tocades"
        elif dt.minute < 7:
            speak += " ben tocada" if hour == 1 else " ben tocades"

        # Pick the period of the hour actually spoken, so 11:59 is
        # "del migdia" (the old `hour == 12` test could never be true).
        spoken_hour = (dt.hour + 1) % 24 if rounds_up else dt.hour
        return speak + _ca_day_period(spoken_hour)

    if variant == TimeVariantCA.SPANISH_LIKE:
        # "tres menys quart": count back from the next hour
        if dt.minute in (35, 40, 45, 50, 55):
            minute = dt.minute - 60
            hour = dt.hour + 1
        else:
            minute = dt.minute
            hour = dt.hour

        # hour may be 24 here; % 12 maps it to "les dotze"
        speak = _ca_hour_name(hour % 12)

        if minute == 15:
            speak += " i quart"
        elif minute == 30:
            speak += " i mitja"
        elif minute == -15:
            speak += " menys quart"
        elif minute > 0:
            # sis i nou, set i vint-i-cinc
            speak += " i " + pronounce_number_ca(minute)
        elif minute < 0:
            # "les set menys vint": no "i"; the negative number is
            # expected to be pronounced with its "menys" prefix
            speak += " " + pronounce_number_ca(minute)
        return speak

    # Default Watch Time System
    if use_24hour:
        # simply speak the number
        if dt.hour == 1:
            speak = "la una"
        elif dt.hour == 2:
            speak = "les dues"
        elif dt.hour == 21:
            speak = "les vint-i-una"
        elif dt.hour == 22:
            speak = "les vint-i-dues"
        else:
            speak = "les " + pronounce_number_ca(dt.hour)

        if dt.minute > 0:
            speak += " i " + pronounce_number_ca(dt.minute)
        return speak

    # speak number and add daytime identifier
    # (equivalent to "in the morning")
    speak = _ca_hour_name(dt.hour % 12)
    if dt.minute == 0:
        # 3:00
        speak += " en punt"
    else:
        speak += " i " + pronounce_number_ca(dt.minute)

    # TODO: review day-periods
    if use_ampm:
        speak += _ca_day_period(dt.hour)
    return speak
def extract_datetime_ca(text, anchorDate=None, default_time=None):
    """Extract a date and/or time from a Catalan sentence.

    The sentence is normalised (lower-cased, punctuation and articles
    removed, synonyms and plurals folded), scanned once for date
    expressions and once for time expressions, and every word that was
    understood is removed from it.

    Args:
        text (str): the sentence to parse
        anchorDate (datetime): reference "now"; defaults to now_local()
        default_time (datetime.time): time to use when the sentence gives
            a date but no time; midnight when omitted
    Returns:
        [datetime, str]: the extracted datetime and the pruned leftover
            text, or None when text is empty or nothing date-like is found
    """

    def clean_string(s):
        # cleans the input string of unneeded punctuation and capitalization
        # among other things
        symbols = [".", ",", ";", "?", "!", "º", "ª"]
        hyphens = ["'", "_"]
        noise_words = ["el", "l", "els", "la", "les", "es", "sa", "ses",
                       "d", "de", "del", "dels"]
        # add final space so the " word " replacements match the last word
        s = s + " "

        s = s.lower()

        for word in symbols:
            s = s.replace(word, "")

        for word in hyphens:
            s = s.replace(word, " ")

        for word in noise_words:
            s = s.replace(" " + word + " ", " ")

        # handle synonyms, plurals and equivalents,
        # "demà ben d'hora" = "demà de matí"
        synonims = {"abans": ["abans-d"],
                    "vinent": ["que vé", "que ve", "que bé", "que be"],
                    "migdia": ["mig dia"],
                    "mitjanit": ["mitja nit"],
                    "matinada": ["matinades", "ben hora ben hora"],
                    "matí": ["matins", "dematí", "dematins", "ben hora"],
                    "tarda": ["tardes", "vesprada", "vesprades", "vespraes"],
                    "nit": ["nits", "vespre", "vespres", "horabaixa",
                            "capvespre"],
                    "demà": ["endemà"],
                    "diàriament": ["diària", "diàries", "cada dia",
                                   "tots dies"],
                    "setmanalment": ["setmanal", "setmanals", "cada setmana",
                                     "totes setmanes"],
                    "quinzenalment": ["quinzenal", "quinzenals",
                                      "cada quinzena", "totes quinzenes"],
                    "mensualment": ["mensual", "mensuals", "cada mes",
                                    "tots mesos"],
                    "anualment": ["anual", "anuals", "cada any", "tots anys"],
                    "demàpassat": ["demà-passat", "demà passat",
                                   "passat demà", "despús-demà",
                                   "despús demà"],
                    "demàpassatpassat": ["demàpassat passat",
                                         "passat demàpassat",
                                         "demàpassat no altre",
                                         "demàpassat altre"],
                    "abansahir": ["abans ahir", "despús ahir", "despús-ahir"],
                    "abansabansahir": ["abans abansahir",
                                       "abansahir no altre",
                                       "abansahir altre",
                                       "abansahir no altre",
                                       "abansahir altre"],
                    "segon": ["segons"],
                    "minut": ["minuts"],
                    "quart": ["quarts"],
                    "hora": ["hores"],
                    "dia": ["dies"],
                    "setmana": ["setmanes"],
                    "quinzena": ["quinzenes"],
                    "mes": ["mesos"],
                    "any": ["anys"],
                    "tocat": ["tocats"],
                    "a": ["al", "als"]
                    }
        for syn in synonims:
            for word in synonims[syn]:
                s = s.replace(" " + word + " ", " " + syn + " ")

        # remove final space
        if s[-1] == " ":
            s = s[:-1]

        return s

    def date_found():
        return found or \
            (
                    datestr != "" or timeStr != "" or
                    yearOffset != 0 or monthOffset != 0 or
                    dayOffset is True or hrOffset != 0 or
                    hrAbs or minOffset != 0 or
                    minAbs or secOffset != 0
            )

    if text == "":
        return None

    anchorDate = anchorDate or now_local()
    found = False
    daySpecified = False
    dayOffset = False
    monthOffset = 0
    yearOffset = 0
    dateNow = anchorDate
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    words = clean_string(text).split(" ")
    timeQualifiersList = ['matí', 'tarda', 'nit']
    time_indicators = ["em", "a", "a les", "cap a", "vora", "després", "estas",
                       "no", "dia", "hora"]
    days = ['dilluns', 'dimarts', 'dimecres',
            'dijous', 'divendres', 'dissabte', 'diumenge']
    months = ['gener', 'febrer', 'març', 'abril', 'maig', 'juny',
              'juliol', 'agost', 'setembre', 'octubre', 'novembre',
              'desembre']
    monthsShort = ['gen', 'feb', 'març', 'abr', 'maig', 'juny', 'jul', 'ag',
                   'set', 'oct', 'nov', 'des']
    nexts = ["pròxim", "pròxima", "vinent"]
    suffix_nexts = ["següent", "després"]
    lasts = ["últim", "última", "darrer", "darrera", "passat", "passada"]
    suffix_lasts = ["passada", "passat", "anterior", "abans"]
    nxts = ["passat", "després", "segueix", "seguit", "seguida", "següent",
            "pròxim", "pròxima"]
    prevs = ["abans", "prèvia", "previamente", "anterior"]
    froms = ["partir", "dins", "des", "a",
             "després", "pròxima", "pròxim", "del", "de"]
    thises = ["aquest", "aquesta", "aqueix", "aqueixa", "este", "esta"]
    froms += thises
    lists = nxts + prevs + froms + time_indicators

    # words that may follow a "from"-style word, as in "5 days from
    # tomorrow"; loop-invariant, so built once
    validFollowups = days + months + monthsShort + [
        "avui", "demà", "ahir", "abansahir", "abansabansahir",
        "demàpassat", "ara", "ja", "abans"]

    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""

        start = idx
        used = 0
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word

        # parse today, tomorrow, yesterday
        elif word == "avui" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "demà" and not fromFlag:
            dayOffset += 1
            used += 1
        elif word == "ahir" and not fromFlag:
            dayOffset -= 1
            used += 1
        # "before yesterday" and "before before yesterday"
        elif (word == "abansahir") and not fromFlag:
            dayOffset -= 2
            used += 1
        elif word == "abansabansahir" and not fromFlag:
            dayOffset -= 3
            used += 1
        # day after tomorrow and after after tomorrow
        elif word == "demàpassat" and not fromFlag:
            dayOffset += 2
            used = 1
        elif word == "demàpassatpassat" and not fromFlag:
            dayOffset += 3
            used = 1
        # parse 5 days, 10 weeks, last week, next week, week after
        elif word == "dia":
            if wordNext == "després" or wordNext == "abans":
                used += 1
                if wordPrev and wordPrev[0].isdigit():
                    dayOffset += int(wordPrev)
                    start -= 1
                    used += 1
            elif (wordPrev and wordPrev[0].isdigit() and
                  wordNext not in months and
                  wordNext not in monthsShort):
                dayOffset += int(wordPrev)
                start -= 1
                used += 2
            elif wordNext and wordNext[0].isdigit() and wordNextNext not in \
                    months and wordNextNext not in monthsShort:
                # the number FOLLOWS "dia", so the consumed span starts at
                # "dia" itself (start used to be moved one word back)
                dayOffset += int(wordNext)
                used += 2

        elif word == "setmana" and not fromFlag:
            # `wordPrev and` guards the sentence-initial case (IndexError)
            if wordPrev and wordPrev[0].isdigit():
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            for w in nexts:
                if wordPrev == w:
                    dayOffset = 7
                    start -= 1
                    used = 2
            for w in lasts:
                if wordPrev == w:
                    dayOffset = -7
                    start -= 1
                    used = 2
            # suffix forms consume the word AFTER "setmana", so start
            # must stay on the current word
            for w in suffix_nexts:
                if wordNext == w:
                    dayOffset = 7
                    used = 2
            for w in suffix_lasts:
                if wordNext == w:
                    dayOffset = -7
                    used = 2
        # parse 10 months, next month, last month
        elif word == "mes" and not fromFlag:
            if wordPrev and wordPrev[0].isdigit():
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            # next/last month is one month (was 7, copied from the week
            # branch where it meant seven days)
            for w in nexts:
                if wordPrev == w:
                    monthOffset = 1
                    start -= 1
                    used = 2
            for w in lasts:
                if wordPrev == w:
                    monthOffset = -1
                    start -= 1
                    used = 2
            for w in suffix_nexts:
                if wordNext == w:
                    monthOffset = 1
                    used = 2
            for w in suffix_lasts:
                if wordNext == w:
                    monthOffset = -1
                    used = 2
        # parse 5 years, next year, last year
        elif word == "any" and not fromFlag:
            if wordPrev and wordPrev[0].isdigit():
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            # next/last year is one year (was 7, same copy/paste slip)
            for w in nexts:
                if wordPrev == w:
                    yearOffset = 1
                    start -= 1
                    used = 2
            for w in lasts:
                if wordPrev == w:
                    yearOffset = -1
                    start -= 1
                    used = 2
            for w in suffix_nexts:
                if wordNext == w:
                    yearOffset = 1
                    used = 2
            for w in suffix_lasts:
                if wordNext == w:
                    yearOffset = -1
                    used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:

            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            for w in nexts:
                if wordPrev == w:
                    dayOffset += 7
                    used += 1
                    start -= 1
            for w in lasts:
                if wordPrev == w:
                    dayOffset -= 7
                    used += 1
                    start -= 1
            for w in suffix_nexts:
                if wordNext == w:
                    dayOffset += 7
                    used += 1
            for w in suffix_lasts:
                if wordNext == w:
                    dayOffset -= 7
                    used += 1
            if wordNext == "feira":
                used += 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        elif word in months or word in monthsShort:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            datestr = months[m]
            if wordPrev and wordPrev[0].isdigit():
                # 13 maig
                datestr += " " + wordPrev
                start -= 1
                used += 1
                if wordNext and wordNext[0].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNext and wordNext[0].isdigit():
                # maig 13
                datestr += " " + wordNext
                used += 1
                if wordNextNext and wordNextNext[0].isdigit():
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordPrevPrev and wordPrevPrev[0].isdigit():
                # 13 dia maig
                datestr += " " + wordPrevPrev

                start -= 2
                used += 2
                # the year, if any, is wordNext (the test used to look at
                # the month name itself and so never matched)
                if wordNext and wordNext[0].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNextNext and wordNextNext[0].isdigit():
                # maig dia 13
                datestr += " " + wordNextNext
                used += 2
                if wordNextNextNext and wordNextNextNext[0].isdigit():
                    datestr += " " + wordNextNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            # a bare month name carries no usable date
            if datestr in months:
                datestr = ""

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        # TODO debug word "passat" that one is failing for some reason
        if word in froms and wordNext in validFollowups:

            if word not in ("passat", "abans", "em"):
                used = 2
                fromFlag = True
            if wordNext == "demà":
                dayOffset += 1
            elif wordNext == "ahir":
                dayOffset -= 1
            elif wordNext == "abansahir":
                dayOffset -= 2
            elif wordNext == "abansabansahir":
                dayOffset -= 3
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if wordNextNext == "dia":
                    used += 1
                if tmpOffset < 0:
                    tmpOffset += 7
                if wordNextNext:
                    if wordNextNext in nxts:
                        tmpOffset += 7
                        used += 1
                    elif wordNextNext in prevs:
                        tmpOffset -= 7
                        used += 1
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - int(today)
                used = 3
                if wordNextNextNext:
                    if wordNextNextNext in nxts:
                        tmpOffset += 7
                        used += 1
                    elif wordNextNextNext in prevs:
                        tmpOffset -= 7
                        used += 1
                dayOffset += tmpOffset
                if wordNextNextNext == "dia":
                    used += 1
        if wordNext in months:
            used -= 1
        if used > 0:

            if start - 1 > 0 and words[start - 1] in lists:
                start -= 1
                used += 1

            for i in range(0, used):
                # never wrap around (negative index) or run past the end
                if 0 <= i + start < len(words):
                    words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in lists:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # parse time
    timeStr = ""
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = None
    minAbs = None

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == "migdia":
            hrAbs = 12
            used += 1
        elif word == "mitjanit":
            # was misspelled "mijanit", which clean_string never produces
            hrAbs = 0
            used += 1
        elif word == "matí":
            if not hrAbs:
                hrAbs = 8
            used += 1
        elif word == "tarda":
            if not hrAbs:
                hrAbs = 15
            used += 1
        elif word == "mitja" and wordNext == "tarda":
            if not hrAbs:
                hrAbs = 17
            used += 2
        elif word == "mig" and wordNext == "matí":
            if not hrAbs:
                hrAbs = 10
            used += 2
        elif word == "vespre" or (word == "final" and wordNext == "tarda"):
            if not hrAbs:
                hrAbs = 19
            used += 2
        elif word == "final" and wordNext == "matí":
            if not hrAbs:
                hrAbs = 11
            used += 2
        elif word == "matinada":
            if not hrAbs:
                hrAbs = 4
            used += 1
        elif word == "nit":
            if not hrAbs:
                hrAbs = 22
            used += 1
        # parse half an hour, quarter hour
        elif word == "hora" and \
                (wordPrev in time_indicators or wordPrevPrev in
                 time_indicators):
            if wordPrev == "mitja":
                minOffset = 30
            elif wordPrev == "quart":
                minOffset = 15
            elif wordPrevPrev == "quart":
                minOffset = 15
                if idx > 2 and words[idx - 3] in time_indicators:
                    words[idx - 3] = ""
                words[idx - 2] = ""
            else:
                hrOffset = 1
            if wordPrevPrev in time_indicators:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            # -1 marks "relative time only, do not apply an absolute time"
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # parse colons
                # "3:00 in the morning"
                stage = 0
                length = len(word)
                for i in range(length):
                    if stage == 0:
                        if word[i].isdigit():
                            strHH += word[i]
                        elif word[i] == ":":
                            stage = 1
                        else:
                            stage = 2
                            i -= 1
                    elif stage == 1:
                        if word[i].isdigit():
                            strMM += word[i]
                        else:
                            stage = 2
                            i -= 1
                    elif stage == 2:
                        remainder = word[i:].replace(".", "")
                        break
                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    # whole hour, not just its first digit, decides am/pm
                    # for "nit" (otherwise "11:00 nit" became 11 am)
                    small_hours = 0 < int(strHH) < 6
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif wordNext == "matí":
                        remainder = "am"
                        used += 1
                    elif (wordNext == "tarda" or wordNext == "vespre"):
                        remainder = "pm"
                        used += 1
                    elif wordNext == "nit":
                        remainder = "am" if small_hours else "pm"
                        used += 1
                    elif wordNext in thises and wordNextNext == "matí":
                        remainder = "am"
                        used = 2
                    elif wordNext in thises and (
                            wordNextNext == "tarda" or
                            wordNextNext == "vespre"):
                        remainder = "pm"
                        used = 2
                    elif wordNext in thises and wordNextNext == "nit":
                        remainder = "pm"
                        used = 2
                    # No adjacent marker: fall back to a qualifier found
                    # elsewhere in the sentence.  (The old code compared
                    # and added the str hour to an int -> TypeError.)
                    elif timeQualifier == "tarda":
                        remainder = "pm"
                    elif timeQualifier == "nit":
                        remainder = "am" if small_hours else "pm"

            else:
                # try to parse numbers without colons
                # 5 hours, 10 minutes etc.
                strNum = ""
                remainder = ""
                for char in word:
                    if char.isdigit():
                        strNum += char
                    else:
                        remainder += char
                # numeric value of the digits only, so trailing letters
                # ("13h") no longer make int() raise
                num = int(strNum)

                if remainder == "":
                    remainder = wordNext.replace(".", "").lstrip().rstrip()

                if (
                        remainder == "pm" or
                        wordNext == "pm" or
                        remainder == "p.m." or
                        wordNext == "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif (
                        remainder == "am" or
                        wordNext == "am" or
                        remainder == "a.m." or
                        wordNext == "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1
                else:
                    if (wordNext == "pm" or
                            wordNext == "p.m." or
                            wordNext == "tarda" or
                            wordNext == "vespre"):
                        strHH = strNum
                        remainder = "pm"
                        used = 1
                    elif (wordNext == "am" or
                          wordNext == "a.m." or
                          wordNext == "matí"):
                        strHH = strNum
                        remainder = "am"
                        used = 1
                    elif (num > 100 and
                          (
                                  wordPrev == "o" or
                                  wordPrev == "oh" or
                                  wordPrev == "zero"
                          )):
                        # 0800 hours (pronounced oh-eight-hundred)
                        # integer split; "/" gave a float hour and always
                        # lost the minutes
                        strHH = num // 100
                        strMM = num % 100
                        if wordNext == "hora":
                            used += 1
                    elif (
                            wordNext == "hora" and
                            word[0] != '0' and
                            (
                                    num < 100 or
                                    num > 2400
                            )):
                        # ignores military time
                        # "in 3 hours"
                        # (`or`, as in the az parser; with `and` the test
                        # could never be true)
                        hrOffset = num
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1

                    elif wordNext == "minut":
                        # "in 10 minutes"
                        minOffset = num
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "segon":
                        # in 5 seconds
                        secOffset = num
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif num > 100:
                        strHH = num // 100
                        strMM = num % 100
                        if wordNext == "hora":
                            used += 1

                    elif wordNext == "" or (
                            wordNext == "en" and wordNextNext == "punt"):
                        strHH = strNum
                        strMM = 0
                        if wordNext == "en" and wordNextNext == "punt":
                            used += 2
                            if (wordNextNextNext == "tarda" or
                                    wordNextNextNext == "vespre"):
                                remainder = "pm"
                                used += 1
                            elif wordNextNextNext == "matí":
                                remainder = "am"
                                used += 1
                            elif wordNextNextNext == "nit":
                                # was `0 > h > 6`, which is always false
                                if 0 < num < 6:
                                    remainder = "am"
                                else:
                                    remainder = "pm"
                                used += 1

                    elif wordNext[0].isdigit():
                        # "04 38": hour and minute as separate words
                        strHH = strNum
                        strMM = wordNext
                        used += 1
                        if wordNextNext == "hora":
                            used += 1
                    else:
                        isTime = False

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            strHH = strHH + 12 if (remainder == "pm" and
                                   0 < strHH < 12) else strHH
            strHH = strHH - 12 if (remainder == "am" and
                                   strHH >= 12) else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH * 1
                minAbs = strMM * 1
                used += 1

        if used > 0:
            # removed parsed words from the sentence
            for i in range(used):
                if idx + i >= len(words):
                    break
                words[idx + i] = ""

            # wordPrev may already have been blanked above, so check
            # membership before looking it up
            if (wordPrev == "en" or wordPrev == "punt") and \
                    wordPrev in words:
                words[words.index(wordPrev)] = ""

            if idx > 0 and wordPrev in time_indicators:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in time_indicators:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    # (date_found must be CALLED; the bare function object is always
    # truthy, so None was never returned)
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation

    # -1 in hrAbs/minAbs means only a relative time was given
    useAbsTime = (hrAbs or 0) != -1 and (minAbs or 0) != -1
    relativeOnly = (hrOffset != 0 or minOffset != 0 or secOffset != 0) \
        and datestr == "" and not useAbsTime

    extractedDate = dateNow.replace(microsecond=0)
    if not relativeOnly:
        # Ignore the current HH:MM:SS unless the request is purely
        # relative ("en 10 minuts" counts from now, not from midnight;
        # same rule as the az parser).
        extractedDate = extractedDate.replace(second=0,
                                              minute=0,
                                              hour=0)
    if datestr != "":
        # datestr is "<catalan month> <day>[ <year>]".  Read the parts
        # directly: the previous translate-to-English + strptime("%B %d")
        # round trip turned "juliol" into "julyy", could not parse a
        # year, and depended on the process locale.
        dateParts = datestr.split(" ")
        month = months.index(dateParts[0]) + 1
        day = int(dateParts[1])

        if not hasYear:
            temp = extractedDate.replace(year=int(currentYear),
                                         month=month, day=day)
            if extractedDate < temp:
                extractedDate = temp
            else:
                # that day is today or already gone: use next year
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=month,
                    day=day)
        else:
            extractedDate = extractedDate.replace(
                year=int(dateParts[2]),
                month=month,
                day=day)

    # NOTE: timeStr is never assigned a non-empty value; the former
    # `datetime(timeStr)` branch was unreachable and has been dropped.

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)
    if useAbsTime:
        if hrAbs is None and minAbs is None and default_time:
            hrAbs = default_time.hour
            minAbs = default_time.minute
        extractedDate = extractedDate + relativedelta(hours=hrAbs or 0,
                                                      minutes=minAbs or 0)
        if (hrAbs or minAbs) and datestr == "":
            # a bare time that has already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    resultStr = _ca_pruning(resultStr)
    return [extractedDate, resultStr]
relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + if (hrAbs or 0) != -1 and (minAbs or 0) != -1: + if hrAbs is None and minAbs is None and default_time: + hrAbs = default_time.hour + minAbs = default_time.minute + extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, + minutes=minAbs or 0) + if (hrAbs or minAbs) and datestr == "": + if not daySpecified and dateNow > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + resultStr = _ca_pruning(resultStr) + return [extractedDate, resultStr] + + +def _ca_pruning(text, symbols=True, accents=False, agressive=True): + # agressive ca word pruning + words = ["l", "la", "el", "els", "les", "de", "dels", + "ell", "ells", "me", "és", "som", "al", "a", "dins", "per", + "aquest", "aquesta", "això", "aixina", "en", "aquell", "aquella", + "va", "vam", "vaig", "quin", "quina"] + if symbols: + symbols = [".", ",", ";", ":", "!", "?", "¡", "¿"] + for symbol in symbols: + text = text.replace(symbol, "") + text = text.replace("'", " ").replace("_", " ") + # accents=False + if accents: + accents = {"a": ["á", "à", "ã", "â"], + "e": ["ê", "è", "é"], + "i": ["í", "ï"], + "o": ["ò", "ó"], + "u": ["ú", "ü"], + "c": ["ç"], + "ll": ["l·l"], + "n": ["ñ"]} + for char in accents: + for acc in accents[char]: + text = text.replace(acc, char) + if agressive: + text_words = text.split(" ") + for idx, word in enumerate(text_words): + if word in words: + text_words[idx] = "" + text = " ".join(text_words) + text = ' '.join(text.split()) + return text diff --git a/ovos_date_parser/dates_cs.py b/ovos_date_parser/dates_cs.py new 
file mode 100644 index 0000000..fa296c5 --- /dev/null +++ b/ovos_date_parser/dates_cs.py @@ -0,0 +1,1187 @@ +import re +from datetime import datetime, timedelta + +from dateutil.relativedelta import relativedelta +from ovos_number_parser.numbers_cs import pronounce_number_cs, _ORDINAL_BASE_CS, extract_number_cs, \ + _convert_words_to_numbers_cs +from ovos_number_parser.util import is_numeric +from ovos_utils.time import now_local + +_MONTHS_CONVERSION = { + 0: "january", + 1: "february", + 2: "march", + 3: "april", + 4: "may", + 5: "june", + 6: "july", + 7: "august", + 8: "september", + 9: "october", + 10: "november", + 11: "december" +} + +_MONTHS_CZECH = ['leden', 'únor', 'březen', 'duben', 'květen', 'červen', + 'červenec', 'srpen', 'září', 'říjen', 'listopad', + 'prosinec'] + +# Time +_TIME_UNITS_CONVERSION = { + 'mikrosekund': 'microseconds', + 'milisekund': 'milliseconds', + 'sekundu': 'seconds', + 'sekundy': 'seconds', + 'sekund': 'seconds', + 'minutu': 'minutes', + 'minuty': 'minutes', + 'minut': 'minutes', + 'hodin': 'hours', + 'den': 'days', # 1 day + 'dny': 'days', # 2-4 days + 'dnů': 'days', # 5+ days + 'dní': 'days', # 5+ days - different inflection + 'dne': 'days', # a half day + 'týden': 'weeks', + 'týdny': 'weeks', + 'týdnů': 'weeks' +} + + +def nice_time_cs(dt, speech=True, use_24hour=True, use_ampm=False): + """ + Format a time to a comfortable human format + For example, generate 'five thirty' for speech or '5:30' for + text display. + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. 
def extract_duration_cs(text):
    """
    Extract a duration from a Czech phrase.

    Number/unit pairs such as
        "10 minut"
        "3 dny 8 hodin 10 minut"
    are summed into a single timedelta. The words used in the duration
    are consumed, and the remainder is returned.

    Every unit spelling listed in _TIME_UNITS_CONVERSION is tried in turn;
    an optional trailing "a" or "y" after the unit is consumed as well.
    The number and the unit may be separated by whitespace or a hyphen.

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The first value will
                    be None if no duration is found. The text returned
                    will have whitespace stripped from the ends.
                    NOTE: for empty/None input a bare None is returned
                    instead of a tuple (kept for backward compatibility).
    """
    if not text:
        return None

    # Czech inflection for time: minuta,minuty,minut - safe to use minut as pattern
    # For day: den, dny, dnů - short pattern not applicable, list all

    time_units = {
        'microseconds': 0,
        'milliseconds': 0,
        'seconds': 0,
        'minutes': 0,
        'hours': 0,
        'days': 0,
        'weeks': 0
    }

    # The numeric part is captured in the named group "value". The group name
    # had been lost, leaving "(?P\d+...)", which is not a valid regex and made
    # re.sub() raise re.error for every non-empty input.
    # {unit} is filled in per unit with str.format() below.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}[ay]?"
    # presumably rewrites spelled-out numbers as digits so the regex can
    # see them - confirm against ovos_number_parser
    text = _convert_words_to_numbers_cs(text)

    for (unit_cs, unit_en) in _TIME_UNITS_CONVERSION.items():
        unit_pattern = pattern.format(unit=unit_cs)

        def repl(match):
            # Accumulate the amount under the timedelta keyword and delete
            # the matched words from the text. The closure is only used by
            # the re.sub() call right below, so unit_en is bound correctly.
            time_units[unit_en] += float(match.group("value"))
            return ''

        text = re.sub(unit_pattern, repl, text)

    text = text.strip()
    duration = timedelta(**time_units) if any(time_units.values()) else None

    return (duration, text)
+ + Args: + text (str): string containing date words + anchorDate (datetime): A reference date/time for "tommorrow", etc + default_time (time): Time to set if no time was found in the string + + Returns: + [datetime, str]: An array containing the datetime and the remaining + text not consumed in the parsing, or None if no + date or time related text was found. + """ + + def clean_string(s): + # clean unneeded punctuation and capitalization among other things. + # Normalize czech inflection + s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ + .replace("dvoje", "2").replace("dvojice", "2") \ + .replace("dnes večer", "večer").replace("dnes v noci", "noci") # \ + # .replace("tento večer", "večer") + # .replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ') \ + # .replace("o' clock", "o'clock").replace("o clock", "o'clock") \ + # .replace("o ' clock", "o'clock").replace("o 'clock", "o'clock") \ + # .replace("decades", "decade") \ + # .replace("tisíciletí", "milénium") + # .replace("oclock", "o'clock") + wordList = s.split() + + for idx, word in enumerate(wordList): + # word = word.replace("'s", "") + ########## + # Czech Day Ordinals - we do not use 1st,2nd format + # instead we use full ordinal number names with specific format(suffix) + # Example: třicátého prvního > 31 + count_ordinals = 0 + if word == "prvního": + count_ordinals = 1 # These two have different format + elif word == "třetího": + count_ordinals = 3 + elif word.endswith("ého"): + tmp = word[:-3] + tmp += ("ý") + for nr, name in _ORDINAL_BASE_CS.items(): + if name == tmp: + count_ordinals = nr + + # If number is bigger than 19 chceck if next word is also ordinal + # and count them together + if count_ordinals > 19: + if wordList[idx + 1] == "prvního": + count_ordinals += 1 # These two have different format + elif wordList[idx + 1] == "třetího": + count_ordinals += 3 + elif wordList[idx + 1].endswith("ého"): + tmp = wordList[idx + 1][:-3] + tmp += ("ý") + for nr, name in 
_ORDINAL_BASE_CS.items(): + if name == tmp and nr < 10: + # write only if sum makes acceptable count of days in month + if (count_ordinals + nr) <= 31: + count_ordinals += nr + + if count_ordinals > 0: + word = str(count_ordinals) # Write normalized valu into word + if count_ordinals > 20: + # If counted number is grather than 20, clear next word so it is not used again + wordList[idx + 1] = "" + ########## + # Remove inflection from czech months + + wordList[idx] = word + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if text == "": + return None + + anchorDate = anchorDate or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + today = anchorDate.strftime("%w") + currentYear = anchorDate.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersAM = ['ráno', 'dopoledne'] + timeQualifiersPM = ['odpoledne', 'večer', 'noc', 'noci'] + timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) + markers = ['na', 'v', 'do', 'na', 'tento', + 'okolo', 'toto', 'během', 'za', 'této'] + days = ['pondělí', 'úterý', 'středa', + 'čtvrtek', 'pátek', 'sobota', 'neděle'] + months = _MONTHS_CZECH + recur_markers = days + [d + 'ho' for d in days] + \ + ['víkend', 'všední'] # Check this + monthsShort = ['led', 'úno', 'bře', 'dub', 'kvě', 'čvn', 'čvc', 'srp', + 'zář', 'říj', 'lis', 'pro'] + year_multiples = ["desetiletí", "století", "tisíciletí"] + day_multiples = ["týden", "měsíc", "rok"] + + words = clean_string(text) + + for idx, word in enumerate(words): + if word == "": + continue + + word = _text_cs_inflection_normalize(word, 2) + wordPrevPrev = _text_cs_inflection_normalize( + words[idx - 2], 2) if idx > 1 else "" + wordPrev = _text_cs_inflection_normalize( + words[idx - 1], 2) if idx > 0 else 
"" + wordNext = _text_cs_inflection_normalize( + words[idx + 1], 2) if idx + 1 < len(words) else "" + wordNextNext = _text_cs_inflection_normalize( + words[idx + 2], 2) if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + # word = word.rstrip('s') + start = idx + used = 0 + # save timequalifier for later + # if word == "před" and dayOffset: + # dayOffset = - dayOffset + # used += 1 + if word == "nyní" and not datestr: + resultStr = " ".join(words[idx + 1:]) + resultStr = ' '.join(resultStr.split()) + extractedDate = anchorDate.replace(microsecond=0) + return [extractedDate, resultStr] + elif wordNext in year_multiples: + multiplier = None + if is_numeric(word): + multiplier = extract_number_cs(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if wordNext == "desetiletí": + yearOffset = multiplier * 10 + elif wordNext == "století": + yearOffset = multiplier * 100 + elif wordNext == "tisíciletí": + yearOffset = multiplier * 1000 + # couple of + elif word == "2" and wordNext == "krát" and \ + wordNextNext in year_multiples: + multiplier = 2 + used += 3 + if wordNextNext == "desetiletí": + yearOffset = multiplier * 10 + elif wordNextNext == "století": + yearOffset = multiplier * 100 + elif wordNextNext == "tisíciletí": + yearOffset = multiplier * 1000 + elif word == "2" and wordNext == "krát" and \ + wordNextNext in day_multiples: + multiplier = 2 + used += 3 + if wordNextNext == "rok": + yearOffset = multiplier + elif wordNextNext == "měsíc": + monthOffset = multiplier + elif wordNextNext == "týden": + dayOffset = multiplier * 7 + elif word in timeQualifiersList: + timeQualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "dnes" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "zítra" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "den" and wordNext == "před" and wordNextNext == "včera" and not fromFlag: + dayOffset = -2 + used += 
3 + elif word == "před" and wordNext == "včera" and not fromFlag: + dayOffset = -2 + used += 2 + elif word == "včera" and not fromFlag: + dayOffset = -1 + used += 1 + elif (word == "den" and + wordNext == "po" and + wordNextNext == "zítra" and + not fromFlag and + (not wordPrev or not wordPrev[0].isdigit())): + dayOffset = 2 + used = 3 + if wordPrev == "ten": + start -= 1 + used += 1 + # parse 5 days, 10 weeks, last week, next week + elif word == "den": + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + if wordPrevPrev == "před": + dayOffset = -dayOffset + used += 1 + start -= 1 + + elif word == "týden" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + elif wordPrev == "další" or wordPrev == "příští": + dayOffset = 7 + start -= 1 + used = 2 + elif wordPrev == "poslední": + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "měsíc" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "další" or wordPrev == "příští": + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "poslední": + monthOffset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "rok" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "další" or wordPrev == "příští": + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "poslední": + yearOffset = -1 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. 
+ elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "další" or wordPrev == "příští": + if dayOffset <= 2: + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "poslední": + dayOffset -= 7 + used += 1 + start -= 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort and not fromFlag: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + # Convert czech months to english + datestr = _MONTHS_CONVERSION.get(m) + if wordPrev and (wordPrev[0].isdigit() or + (wordPrev == " " and wordPrevPrev[0].isdigit())): + if wordPrev == " " and wordPrevPrev[0].isdigit(): + datestr += " " + words[idx - 2] + used += 1 + start -= 1 + else: + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + # if no date indicators found, it may not be the month of May + # may "i/we" ... + # "... 
may be" + # elif word == 'may' and wordNext in ['i', 'we', 'be']: + # datestr = "" + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("dnes") + validFollowups.append("zítra") + validFollowups.append("včera") + validFollowups.append("další") + validFollowups.append("příští") + validFollowups.append("poslední") + validFollowups.append("teď") + validFollowups.append("toto") + validFollowups.append("této") + validFollowups.append("tento") + if (word == "od" or word == "po" or word == "do") and wordNext in validFollowups: + used = 2 + fromFlag = True + if wordNext == "zítra": + dayOffset += 1 + elif wordNext == "včera": + dayOffset -= 1 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNext == "další" or wordPrev == "příští": + if dayOffset <= 2: + tmpOffset += 7 + used += 1 + start -= 1 + elif wordNext == "poslední": + tmpOffset -= 7 + used += 1 + start -= 1 + dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and ( + words[start - 1] == "toto" or words[start - 1] == "této" or words[start - 1] == "tento"): + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + + word = _text_cs_inflection_normalize(word, 2) + wordPrevPrev = _text_cs_inflection_normalize( + words[idx - 2], 2) if idx > 1 else "" + wordPrev = _text_cs_inflection_normalize( + words[idx - 1], 2) if idx > 0 else "" + wordNext = 
_text_cs_inflection_normalize( + words[idx + 1], 2) if idx + 1 < len(words) else "" + wordNextNext = _text_cs_inflection_normalize( + words[idx + 2], 2) if idx + 2 < len(words) else "" + + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "poledne": + hrAbs = 12 + used += 1 + elif word == "půlnoc": + hrAbs = 0 + used += 1 + elif word == "ráno": + if hrAbs is None: + hrAbs = 8 + used += 1 + elif word == "odpoledne": + if hrAbs is None: + hrAbs = 15 + used += 1 + elif word == "večer": + if hrAbs is None: + hrAbs = 19 + used += 1 + if (wordNext != "" and wordNext[0].isdigit() and ":" in wordNext): + used -= 1 + elif word == "noci" or word == "noc": + if hrAbs is None: + hrAbs = 22 + # used += 1 + # if ((wordNext !='' and not wordNext[0].isdigit()) or wordNext =='') and \ + # ((wordNextNext !='' and not wordNextNext[0].isdigit())or wordNextNext =='') : + # used += 1 + # used += 1 ## NOTE this breaks other tests, TODO refactor me! + + # couple of time_unit + elif word == "2" and wordNext == "krát" and \ + wordNextNext in ["hodin", "minut", "sekund"]: + used += 3 + if wordNextNext == "hodin": + hrOffset = 2 + elif wordNextNext == "minut": + minOffset = 2 + elif wordNextNext == "sekund": + secOffset = 2 + # parse half an hour, quarter hour + elif word == "hodin" and \ + (wordPrev in markers or wordPrevPrev in markers): + if wordPrev == "půl": + minOffset = 30 + elif wordPrev == "čtvrt": + minOffset = 15 + elif wordPrevPrev == "třičtvrtě": + minOffset = 15 + if idx > 2 and words[idx - 3] in markers: + words[idx - 3] = "" + words[idx - 2] = "" + elif wordPrev == "během": + hrOffset = 1 + else: + hrOffset = 1 + if wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "tato" or wordPrevPrev == "této": + daySpecified = True + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + # parse in a minute + elif word == "minut" and wordPrev == "za": + minOffset = 1 + words[idx - 1] = "" + used += 1 + # 
parse in a second + elif word == "sekund" and wordPrev == "za": + secOffset = 1 + words[idx - 1] = "" + used += 1 + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + if wordNext == "večer" or wordNext == "noci" or wordNextNext == "večer" \ + or wordNextNext == "noci" or wordPrev == "večer" \ + or wordPrev == "noci" or wordPrevPrev == "večer" \ + or wordPrevPrev == "noci" or wordNextNextNext == "večer" \ + or wordNextNextNext == "noci": + remainder = "pm" + used += 1 + if wordPrev == "večer" or wordPrev == "noci": + words[idx - 1] = "" + if wordPrevPrev == "večer" or wordPrevPrev == "noci": + words[idx - 2] = "" + if wordNextNext == "večer" or wordNextNext == "noci": + used += 1 + if wordNextNextNext == "večer" or wordNextNextNext == "noci": + used += 1 + + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + + # elif wordNext == "in" and wordNextNext == "the" and \ + # words[idx + 3] == "ráno": + # remainder = "am" + # used += 3 + # elif wordNext == "in" and wordNextNext == "the" and \ + # words[idx + 3] == "odpoledne": + # remainder = "pm" + # used += 3 + # elif wordNext == "in" and wordNextNext == "the" and \ + # words[idx + 3] == "večer": + # remainder = "pm" + # used += 3 + elif wordNext == "ráno": + remainder = "am" + used += 2 + elif wordNext == "odpoledne": + remainder = "pm" + used += 2 + elif wordNext == "večer": + remainder = "pm" + used += 2 + elif wordNext == "toto" and 
wordNextNext == "ráno": + remainder = "am" + used = 2 + daySpecified = True + elif wordNext == "na" and wordNextNext == "odpoledne": + remainder = "pm" + used = 2 + daySpecified = True + elif wordNext == "na" and wordNextNext == "večer": + remainder = "pm" + used = 2 + daySpecified = True + elif wordNext == "v" and wordNextNext == "noci": + if strHH and int(strHH) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + + else: + if timeQualifier != "": + military = True + if strHH and int(strHH) <= 12 and \ + (timeQualifier in timeQualifiersPM): + strHH += str(int(strHH) + 12) + + else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." 
or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + wordNext in recur_markers or + wordNextNext in recur_markers): + # Ex: "7 on mondays" or "3 this friday" + # Set strHH so that isTime == True + # when am or pm is not specified + strHH = strNum + used = 1 + else: + if (int(strNum) > 100): # and #Check this + # ( + # wordPrev == "o" or + # wordPrev == "oh" + # )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if wordNext == "hodin": + used += 1 + elif ( + (wordNext == "hodin" or + remainder == "hodin") and + word[0] != '0' and + # (wordPrev != "v" and wordPrev != "na") + wordPrev == "za" + and + ( + int(strNum) < 100 or + int(strNum) > 2400 + )): + # ignores military time + # "in 3 hours" + hrOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "minut" or \ + remainder == "minut": + # "in 10 minutes" + minOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "sekund" \ + or remainder == "sekund": + # in 5 seconds + secOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(strNum) > 100: + # military time, eg. "3300 hours" + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if wordNext == "hodin" or \ + remainder == "hodin": + used += 1 + elif wordNext and wordNext[0].isdigit(): + # military time, e.g. 
"04 38 hours" + strHH = strNum + strMM = wordNext + military = True + used += 1 + if (wordNextNext == "hodin" or + remainder == "hodin"): + used += 1 + elif ( + wordNext == "" or wordNext == "hodin" or + ( + (wordNext == "v" or wordNext == "na") and + ( + wordNextNext == timeQualifier + ) + ) or wordNext == 'večer' or + wordNextNext == 'večer'): + + strHH = strNum + strMM = "00" + if wordNext == "hodin": + used += 1 + if (wordNext == "v" or wordNext == "na" + or wordNextNext == "v" or wordNextNext == "na"): + used += (1 if (wordNext == + "v" or wordNext == "na") else 2) + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if (wordNextNext and + (wordNextNext in timeQualifier or + wordNextNextNext in timeQualifier)): + if (wordNextNext in timeQualifiersPM or + wordNextNextNext in timeQualifiersPM): + remainder = "pm" + used += 1 + if (wordNextNext in timeQualifiersAM or + wordNextNextNext in timeQualifiersAM): + remainder = "am" + used += 1 + + if timeQualifier != "": + if timeQualifier in timeQualifiersPM: + remainder = "pm" + used += 1 + + elif timeQualifier in timeQualifiersAM: + remainder = "am" + used += 1 + else: + # TODO: Unsure if this is 100% accurate + used += 1 + military = True + elif remainder == "hodin": + remainder = "" + + else: + isTime = False + HH = int(strHH) if strHH else 0 + MM = int(strMM) if strMM else 0 + HH = HH + 12 if remainder == "pm" and HH < 12 else HH + HH = HH - 12 if remainder == "am" and HH >= 12 else HH + if (not military and + remainder not in ['am', 'pm', 'hodin', 'minut', 'sekund'] and + ((not daySpecified) or 0 <= dayOffset < 1)): + + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + if anchorDate.hour < HH or (anchorDate.hour == HH and + anchorDate.minute < MM): + pass # No modification needed + elif anchorDate.hour < HH + 12: + HH += 12 + else: + # has passed, assume the next morning + dayOffset += 1 + if timeQualifier in 
timeQualifiersPM and HH < 12: + HH += 12 + + if HH > 24 or MM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = HH + minAbs = MM + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + + # if wordPrev == "o" or wordPrev == "oh": + # words[words.index(wordPrev)] = "" + + if wordPrev == "brzy": + hrOffset = -1 + words[idx - 1] = "" + idx -= 1 + elif wordPrev == "pozdě": + hrOffset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and wordPrev in markers: + words[idx - 1] = "" + if wordPrev == "toto" or wordPrev == "této": + daySpecified = True + if idx > 1 and wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "toto" or wordPrev == "této": + daySpecified = True + + idx += used - 1 + found = True + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = anchorDate.replace(microsecond=0) + if datestr != "": + # date included an explicit date, e.g. 
"june 5" or "june 2, 2017" + try: + temp = datetime.strptime(datestr, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(datestr, "%B %d %Y") + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + if not hasYear: + temp = temp.replace(year=extractedDate.year, + tzinfo=extractedDate.tzinfo) + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hrOffset == 0 and minOffset == 0 and secOffset == 0: + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + if hrAbs != -1 and minAbs != -1: + # If no time was supplied in the string set the time to default + # time if it's available + if hrAbs is None and minAbs is None and default_time is not None: + hrAbs, minAbs = default_time.hour, default_time.minute + else: + hrAbs = hrAbs or 0 + minAbs = minAbs or 0 + + extractedDate = extractedDate + relativedelta(hours=hrAbs, + minutes=minAbs) + if (hrAbs != 0 or minAbs != 0) and datestr == "": + if not daySpecified and anchorDate > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if 
minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + for idx, word in enumerate(words): + if words[idx] == "a" and \ + words[idx - 1] == "" and words[idx + 1] == "": + words[idx] = "" + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + return [extractedDate, resultStr] + + +def _text_cs_inflection_normalize(word, arg): + """ + Czech Inflection normalizer. + + This try to normalize known inflection. This function is called + from multiple places, each one is defined with arg. + + Args: + word [Word] + arg [Int] + + Returns: + word [Word] + + """ + if arg == 1: # _extract_whole_number_with_text_cs + # Number one (jedna) + if len(word) == 5 and word.startswith("jed"): + suffix = 'en', 'no', 'ny' + if word.endswith(suffix, 3): + word = "jedna" + + # Number two (dva) + elif word == "dvě": + word = "dva" + + elif arg == 2: # extract_datetime_cs TODO: This is ugly + if word == "hodina": + word = "hodin" + if word == "hodiny": + word = "hodin" + if word == "hodinu": + word = "hodin" + if word == "minuta": + word = "minut" + if word == "minuty": + word = "minut" + if word == "minutu": + word = "minut" + if word == "minutu": + word = "minut" + if word == "sekunda": + word = "sekund" + if word == "sekundy": + word = "sekund" + if word == "sekundu": + word = "sekund" + if word == "dní": + word = "den" + if word == "dnů": + word = "den" + if word == "dny": + word = "den" + if word == "týdny": + word = "týden" + if word == "týdnů": + word = "týden" + if word == "měsíců": + word = "měsíc" + if word == "měsíce": + word = "měsíc" + if word == "měsíci": + word = "měsíc" + if word == "roky": + word = "rok" + if word == "roků": + word = "rok" + if word == "let": + word = "rok" + if word == "včerejšku": + word = "včera" + if word == "zítřku": + word = "zítra" + if word == "zítřejší": + word = "zítra" + if word == "ranní": + word = "ráno" + if 
word == "dopolední": + word = "dopoledne" + if word == "polední": + word = "poledne" + if word == "odpolední": + word = "odpoledne" + if word == "večerní": + word = "večer" + if word == "noční": + word = "noc" + if word == "víkendech": + word = "víkend" + if word == "víkendu": + word = "víkend" + if word == "všedních": + word = "všední" + if word == "všedním": + word = "všední" + + # Months + if word == "únoru": + word = "únor" + elif word == "červenci": + word = "červenec" + elif word == "července": + word = "červenec" + elif word == "listopadu": + word = "listopad" + elif word == "prosinci": + word = "prosinec" + + elif word.endswith("nu") or word.endswith("na"): + tmp = word[:-2] + tmp += ("en") + for name in _MONTHS_CZECH: + if name == tmp: + word = name + + return word diff --git a/ovos_date_parser/dates_da.py b/ovos_date_parser/dates_da.py new file mode 100644 index 0000000..e195b7c --- /dev/null +++ b/ovos_date_parser/dates_da.py @@ -0,0 +1,783 @@ +from datetime import datetime + +from dateutil.relativedelta import relativedelta +from ovos_number_parser.numbers_da import pronounce_ordinal_da, pronounce_number_da, is_ordinal_da +from ovos_number_parser.util import is_numeric +from ovos_utils.time import now_local + +_MONTHS_DA = ['januar', 'februar', 'märz', 'april', 'mai', 'juni', + 'juli', 'august', 'september', 'oktober', 'november', + 'dezember'] + + +def nice_time_da(dt, speech=True, use_24hour=False, use_ampm=False): + """ + Format a time to a comfortable human format + + For example, generate 'five thirty' for speech or '5:30' for + text display. + + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. 
"03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. "3:01" or "2:22" + string = dt.strftime("%I:%M") + + if not speech: + return string + + # Generate a speakable version of the time + speak = "" + if use_24hour: + if dt.hour == 1: + speak += "et" # 01:00 is "et" not "en" + else: + speak += pronounce_number_da(dt.hour) + if not dt.minute == 0: + if dt.minute < 10: + speak += ' nul' + speak += " " + pronounce_number_da(dt.minute) + + return speak # ampm is ignored when use_24hour is true + else: + if dt.hour == 0 and dt.minute == 0: + return "midnat" + if dt.hour == 12 and dt.minute == 0: + return "middag" + # TODO: "half past 3", "a quarter of 4" and other idiomatic times + + if dt.hour == 0: + speak += pronounce_number_da(12) + elif dt.hour <= 13: + if dt.hour == 1 or dt.hour == 13: # 01:00 and 13:00 is "et" + speak += 'et' + else: + speak += pronounce_number_da(dt.hour) + else: + speak += pronounce_number_da(dt.hour - 12) + + if not dt.minute == 0: + if dt.minute < 10: + speak += ' nul' + speak += " " + pronounce_number_da(dt.minute) + + if use_ampm: + if dt.hour > 11: + if dt.hour < 18: + # 12:01 - 17:59 nachmittags/afternoon + speak += " om eftermiddagen" + elif dt.hour < 22: + # 18:00 - 21:59 abends/evening + speak += " om aftenen" + else: + # 22:00 - 23:59 nachts/at night + speak += " om natten" + elif dt.hour < 3: + # 00:01 - 02:59 nachts/at night + speak += " om natten" + else: + # 03:00 - 11:59 morgens/in the morning + speak += " om morgenen" + + return speak + + +def _nice_ordinal_da(text, speech=True): + # check for months for declension of ordinals before months + # depending on articles/prepositions + normalized_text = text + words = text.split() + + for idx, word in enumerate(words): + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordPrev = words[idx - 1] if idx > 0 else "" + if word[-1:] == ".": + if word[:-1].isdecimal(): + if 
def extract_datetime_da(text, anchorDate=None, default_time=None):
    """Extract a date and/or time from a Danish phrase.

    The phrase is scanned twice.  The first pass looks for date expressions
    ("i dag", "i morgen", "overmorgen", weekdays, "<n> dage/uger",
    "næste/forrige uge/måned/år", "<day> <month> [<year>]").  The second
    pass looks for time expressions ("middag", "midnat", parts of the day,
    "HH:MM", am/pm and "<n> timer/minutter/sekunder" offsets).  Words that
    were understood are removed from the phrase.

    Args:
        text (str): the phrase to parse
        anchorDate (datetime): reference point for relative expressions;
            defaults to now_local()
        default_time (datetime.time): time of day to apply when the phrase
            contains no time expression at all

    Returns:
        [datetime, str]: the extracted datetime and the remaining,
            whitespace-normalised text; None when *text* is empty or
            nothing date/time related was found.

    Raises:
        ValueError: when the extracted day/month/year do not form a valid
            calendar date (e.g. "30 februar").
    """

    def clean_string(s):
        """
        cleans the input string of unneeded punctuation
        and capitalization among other things.

        Ordinals recognised by is_ordinal_da are replaced by their
        numeric value (as a string).
        """

        s = s.lower().replace('?', '').replace('.', '').replace(',', '') \
            .replace(' den ', ' ').replace(' det ', ' ') \
            .replace(' om ', ' ').replace(' om ', ' ') \
            .replace(' på ', ' ').replace(' om ', ' ')
        wordList = s.split()

        for idx, word in enumerate(wordList):
            ordinal = is_ordinal_da(word)
            if ordinal is not False:
                wordList[idx] = str(ordinal)

        return wordList

    def date_found():
        """Tell whether either pass recognised anything."""
        return found or \
            (
                    datestr != "" or
                    yearOffset != 0 or monthOffset != 0 or
                    hrOffset != 0 or
                    hrAbs or minOffset != 0 or
                    minAbs or secOffset != 0
            )

    def is_count(token):
        """True for a plain non-negative integer such as "5"."""
        return token.isdecimal()

    def is_day(token):
        """True for a one or two digit number (day of month candidate)."""
        return token.isdecimal() and len(token) <= 2

    def is_year(token):
        """True for a four digit number (year candidate)."""
        return token.isdecimal() and len(token) == 4

    def is_next(token):
        return token == "næste"

    def is_last(token):
        # "forige" is kept for backward compatibility; "forrige" is the
        # regular spelling.
        return token in ("forige", "forrige")

    def blank(first, count):
        """Empty words[first:first + count], ignoring out-of-range slots."""
        for pos in range(first, first + count):
            if 0 <= pos < len(words):
                words[pos] = ""

    if text == "":
        return None

    anchorDate = anchorDate or now_local()
    found = False
    daySpecified = False
    dayOffset = False
    monthOffset = 0
    yearOffset = 0
    dateNow = anchorDate
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    dateDay = None
    dateMonth = None
    dateYear = None
    hasYear = False
    timeQualifier = ""

    # NOTE: "morgen" is deliberately not a qualifier: on its own it means
    # "tomorrow"; "<weekday> morgen" is rewritten to "tidlig" below.
    timeQualifiersList = ['tidlig',
                          'morgenen',
                          'formiddag',
                          'formiddagen',
                          'eftermiddag',
                          'eftermiddagen',
                          'aften',
                          'aftenen',
                          'nat',
                          'natten']
    markers = ['i', 'om', 'på', 'klokken', 'ved']
    days = ['mandag', 'tirsdag', 'onsdag',
            'torsdag', 'fredag', 'lørdag', 'søndag']
    months = ['januar', 'februar', 'marts', 'april', 'maj', 'juni',
              'juli', 'august', 'september', 'oktober', 'november',
              'december']
    monthsShort = ['jan', 'feb', 'mar', 'apr', 'maj', 'juni', 'juli', 'aug',
                   'sep', 'okt', 'nov', 'dec']
    hourWords = ("time", "timer")
    minuteWords = ("minut", "minutter")
    secondWords = ("sekund", "sekunder")

    validFollowups = days + months + monthsShort
    validFollowups.append("i dag")
    validFollowups.append("morgen")
    validFollowups.append("næste")
    validFollowups.append("forige")
    validFollowups.append("forrige")
    validFollowups.append("nu")

    words = clean_string(text)

    # ---- first pass: dates -------------------------------------------
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        start = idx
        used = 0
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word
        # parse 5 days (checked before "today" so "1 dag" is an offset)
        elif word in ("dag", "dage") and is_count(wordPrev):
            dayOffset += int(wordPrev)
            start -= 1
            used = 2
        # parse today, tomorrow, day after tomorrow
        elif word == "dag" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "morgen" and not fromFlag and wordPrev != "om" and \
                wordPrev not in days:
            # "morgen" means tomorrow unless it follows "om" or a weekday
            dayOffset = 1
            used += 1
        elif word == "overmorgen" and not fromFlag:
            dayOffset = 2
            used += 1
        # parse 10 weeks, last week, next week
        elif word in ("uge", "uger") and not fromFlag:
            if is_count(wordPrev):
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif is_next(wordPrev):
                dayOffset = 7
                start -= 1
                used = 2
            elif is_last(wordPrev):
                dayOffset = -7
                start -= 1
                used = 2
        # parse 10 months, next month, last month
        elif word in ("måned", "måneder") and not fromFlag:
            if is_count(wordPrev):
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif is_next(wordPrev):
                monthOffset = 1
                start -= 1
                used = 2
            elif is_last(wordPrev):
                monthOffset = -1
                start -= 1
                used = 2
        # parse 5 years, next year, last year
        elif word == "år" and not fromFlag:
            if is_count(wordPrev):
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif is_next(wordPrev):
                yearOffset = 1
                start -= 1
                used = 2
            elif is_last(wordPrev):
                yearOffset = -1
                start -= 1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordNext == "morgen":
                # morgen means morning if preceded by
                # the day of the week
                words[idx + 1] = "tidlig"
            if is_next(wordPrev):
                dayOffset += 7
                used += 1
                start -= 1
            elif is_last(wordPrev):
                dayOffset -= 7
                used += 1
                start -= 1
        # parse "15 juli", "juli 15", "15 juli 2018", "juli 15 2018"
        elif (word in months or word in monthsShort) and not fromFlag:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            dayViaOf = wordPrev == "of" and is_day(wordPrevPrev)
            if is_day(wordPrev) or dayViaOf:
                used += 1
                if dayViaOf:
                    dateDay = int(wordPrevPrev)
                    used += 1
                    start -= 1
                else:
                    dateDay = int(wordPrev)
                start -= 1
                used += 1
                if is_year(wordNext):
                    dateYear = int(wordNext)
                    used += 1
                    hasYear = True
                else:
                    hasYear = False
                dateMonth = m + 1
                datestr = months[m] + " " + str(dateDay)
            elif is_day(wordNext):
                used += 1
                dateDay = int(wordNext)
                used += 1
                if is_year(wordNextNext):
                    dateYear = int(wordNextNext)
                    used += 1
                    hasYear = True
                else:
                    hasYear = False
                dateMonth = m + 1
                datestr = months[m] + " " + str(dateDay)
            # a month name without a day number is left in the text

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        if (
                word == "fra" or word == "til" or word == "om") and wordNext \
                in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "morgen" and \
                    wordPrev != "om" and \
                    wordPrev not in days:
                # morgen means tomorrow unless it follows "om" or a weekday
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - int(today)
                used = 3
                if is_next(wordNext):
                    tmpOffset += 7
                    used += 1
                    start -= 1
                elif is_last(wordNext):
                    tmpOffset -= 7
                    used += 1
                    start -= 1
                dayOffset += tmpOffset
        if used > 0:
            if start - 1 > 0 and words[start - 1].startswith("denne"):
                start -= 1
                used += 1

            blank(start, used)

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- second pass: times ------------------------------------------
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    # None = no time seen; -1 = a relative offset was given instead
    hrAbs = None
    minAbs = None

    # "<hh:mm> <w1> <w2>" combinations that fix the half of the day
    twoWordMeridiem = {
        ("om", "morgenen"): "am",
        ("om", "eftermiddagen"): "pm",
        ("om", "aftenen"): "pm",
        ("i", "morgen"): "am",
        ("i", "eftermiddag"): "pm",
        ("i", "aften"): "pm",
    }

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word[:6] == "middag":
            hrAbs = 12
            used += 1
        elif word == "midnat":
            hrAbs = 0
            used += 1
        elif word == "morgenen" or word == "tidlig":
            if not hrAbs:
                hrAbs = 8
            used += 1
        elif word[:11] == "eftermiddag":
            if not hrAbs:
                hrAbs = 15
            used += 1
        elif word[:5] == "aften":
            if not hrAbs:
                hrAbs = 19
            used += 1
        # parse half an hour, quarter hour
        elif word == "time" and \
                (wordPrev in markers or wordPrevPrev in markers):
            if wordPrev[:4] == "halv":
                minOffset = 30
            elif wordPrev == "kvarter":
                minOffset = 15
            elif wordPrev == "trekvarter":
                minOffset = 45
            else:
                hrOffset = 1
            if wordPrevPrev in markers:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 pm, etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # "HH:MM", optionally with a glued suffix ("3:00pm")
                pos = 0
                while pos < len(word) and word[pos].isdigit():
                    strHH += word[pos]
                    pos += 1
                if pos < len(word) and word[pos] == ":":
                    pos += 1
                    while pos < len(word) and word[pos].isdigit():
                        strMM += word[pos]
                        pos += 1
                remainder = word[pos:].replace(".", "")

                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif nextWord == "aften" or nextWord == "eftermiddag":
                        remainder = "pm"
                        used += 1
                    elif nextWord == "morgen":
                        remainder = "am"
                        used += 1
                    elif (wordNext, wordNextNext) in twoWordMeridiem:
                        remainder = twoWordMeridiem[(wordNext, wordNextNext)]
                        used += 2
                    elif wordNext == "natten":
                        # same rule as for "<n> natten" below:
                        # 8-12 at night is before midnight
                        if strHH and 8 <= int(strHH) <= 12:
                            remainder = "pm"
                        else:
                            remainder = "am"
                        used += 1
                    elif timeQualifier == "aftenen" or \
                            timeQualifier == "eftermiddagen":
                        # a qualifier elsewhere in the phrase; the hour is
                        # shifted by the am/pm normalisation below
                        remainder = "pm"
            else:
                # try to parse numbers without colons
                # 5 hours, 10 minutes etc.
                strNum = "".join(ch for ch in word if ch.isdigit())
                remainder = "".join(ch for ch in word if not ch.isdigit())

                if remainder == "":
                    remainder = wordNext.replace(".", "").strip()

                # only swallow the following word when it is the am/pm
                # token itself (not for a glued "5pm")
                meridiemIsNextWord = \
                    wordNext.replace(".", "") in ("am", "pm")

                if (
                        remainder == "pm" or
                        wordNext == "pm" or
                        remainder == "p.m." or
                        wordNext == "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    used = 1 if meridiemIsNextWord else 0
                elif (
                        remainder == "am" or
                        wordNext == "am" or
                        remainder == "a.m." or
                        wordNext == "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1 if meridiemIsNextWord else 0
                else:
                    plain = word.isdecimal()
                    if wordNext in hourWords:
                        if plain and int(word) < 100:
                            # "in 3 hours"
                            hrOffset = int(word)
                            used = 2
                            hrAbs = -1
                            minAbs = -1
                        # otherwise: not a usable hour count, ignore it
                        isTime = False
                    elif wordNext in minuteWords and plain:
                        # "in 10 minutes"
                        minOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext in secondWords and plain:
                        # in 5 seconds
                        secOffset = int(word)
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == timeQualifier:
                        # "<n> <qualifier>"; also reached for a bare
                        # number when no qualifier was seen (both "")
                        strHH = strNum
                        if wordNext[:11] == "eftermiddag" or \
                                wordNext[:5] == "aften":
                            used += 1
                            remainder = "pm"
                        elif wordNext[:6] == "morgen":
                            used += 1
                            remainder = "am"
                        elif wordNext == "natten" or wordNext == "nat":
                            used += 1
                            if 8 <= int(strNum) <= 12:
                                remainder = "pm"
                            else:
                                remainder = "am"

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH
            strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH
                minAbs = strMM
                used += 1
        if used > 0:
            # remove parsed words from the sentence
            blank(idx, used)

            if wordPrev == "tidlig":
                hrOffset = -1
                words[idx - 1] = ""
            elif wordPrev == "sen":
                hrOffset = 1
                words[idx - 1] = ""
            if idx > 0 and wordPrev in markers:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in markers:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # ---- date arithmetic ---------------------------------------------
    extractedDate = dateNow.replace(microsecond=0)

    # A pure sub-day offset ("3 timer", "10 minutter") counts from the
    # current clock time; everything else counts from midnight.
    relativeToNow = datestr == "" and \
        (hrOffset != 0 or minOffset != 0 or secOffset != 0) and \
        (hrAbs == -1 or minAbs == -1)
    if not relativeToNow:
        extractedDate = extractedDate.replace(second=0, minute=0, hour=0)

    if datestr != "":
        if not hasYear:
            # pick the next occurrence of that day: this year if it is
            # still ahead, otherwise next year
            thisYear = extractedDate.replace(year=int(currentYear),
                                             month=dateMonth,
                                             day=dateDay)
            if extractedDate < thisYear:
                extractedDate = thisYear
            else:
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=dateMonth,
                    day=dateDay)
        else:
            extractedDate = extractedDate.replace(
                year=dateYear,
                month=dateMonth,
                day=dateDay)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)

    if hrAbs is None and minAbs is None and default_time:
        hrAbs = default_time.hour
        minAbs = default_time.minute

    if hrAbs != -1 and minAbs != -1:

        extractedDate = extractedDate + relativedelta(hours=hrAbs or 0,
                                                      minutes=minAbs or 0)
        if (hrAbs or minAbs) and datestr == "":
            # a bare clock time that already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    # drop an "og" that is left dangling between two consumed words
    for idx, word in enumerate(words):
        before = words[idx - 1] if idx > 0 else ""
        after = words[idx + 1] if idx + 1 < len(words) else ""
        if word == "og" and before == "" and after == "":
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())

    return [extractedDate, resultStr]
def extract_duration_de(text):
    """
    Convert a german phrase into a duration.

    Convert things like:
        "10 Minuten"
        "3 Tage 8 Stunden 10 Minuten und 49 Sekunden"
    into a timedelta. The words used in the duration will be consumed,
    and the remainder returned.

    Number words are first converted to digits with
    _convert_words_to_numbers_de; decimal values may use "." or ","
    ("2,5 stunden").

    Args:
        text (str): string containing a duration
    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The first value will
                    be None if no duration is found. The text returned
                    will have whitespace stripped from the ends.
                    For empty or None input a bare None is returned
                    instead of a tuple.
    """
    if not text:
        return None

    text = text.lower()
    # timedelta() keyword -> german plural of the unit
    unit_names = {
        'microseconds': 'mikrosekunden',
        'milliseconds': 'millisekunden',
        'seconds': 'sekunden',
        'minutes': 'minuten',
        'hours': 'stunden',
        'days': 'tage',
        'weeks': 'wochen'
    }

    # "<number> <unit>" or "<number>-<unit>"; the optional suffix classes
    # cover singular and plural endings.  The group name "value" is what
    # the substitution callback reads.
    pattern = r"(?:^|\s)(?P<value>\d+(?:[.,]?\d+)?\b)(?:\s+|\-)(?P<unit>{unit}[nes]?[sn]?\b)"

    text = _convert_words_to_numbers_de(text)

    # timedelta() keyword -> accumulated amount
    totals = {unit_en: 0 for unit_en in unit_names}

    for unit_en, unit_de in unit_names.items():
        # drop the plural 'n'/'e' so the suffix classes can match any form
        unit_pattern = pattern.format(unit=unit_de[:-1])

        def repl(match, unit_en=unit_en):
            # add the matched amount and remove the match from the text
            value = match.group("value").replace(",", ".")
            totals[unit_en] += float(value)
            return ''

        text = re.sub(unit_pattern, repl, text)

    text = text.strip()
    duration = timedelta(**totals) if any(totals.values()) else None

    return (duration, text)
+ + 'am' is a preposition, so cannot currently be used + for 12 hour date format + """ + + s = _convert_words_to_numbers_de(s) + s = s.lower().replace('?', '').replace(' der ', ' ').replace(' den ', ' ') \ + .replace(' an ', ' ').replace(' am ', ' ').replace(' auf ', ' ') \ + .replace(' um ', ' ') + wordList = s.split() + + for idx, word in enumerate(wordList): + ordinal = _get_ordinal_index(word) + if ordinal: + wordList[idx] = ordinal + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or timeStr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if text == "": + return None + + anchorDate = anchorDate or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + dateNow = anchorDate + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersList = ['früh', 'morgens', 'vormittag', 'vormittags', + 'mittag', 'mittags', 'nachmittag', 'nachmittags', + 'abend', 'abends', 'nacht', 'nachts', 'pm', 'p.m.'] + eveningQualifiers = ['nachmittag', 'nachmittags', 'abend', 'abends', 'nacht', + 'nachts', 'pm', 'p.m.'] + markers = ['in', 'am', 'gegen', 'bis', 'für'] + days = ['montag', 'dienstag', 'mittwoch', + 'donnerstag', 'freitag', 'samstag', 'sonntag'] + months = ['januar', 'februar', 'märz', 'april', 'mai', 'juni', + 'juli', 'august', 'september', 'oktober', 'november', + 'dezember'] + monthsShort = ['jan', 'feb', 'mär', 'apr', 'mai', 'juni', 'juli', 'aug', + 'sept', 'oct', 'nov', 'dez'] + + validFollowups = days + months + monthsShort + validFollowups.append("heute") + validFollowups.append("morgen") + validFollowups.append("nächste") + validFollowups.append("nächster") + validFollowups.append("nächstes") + validFollowups.append("nächsten") + validFollowups.append("nächstem") + 
validFollowups.append("letzte") + validFollowups.append("letzter") + validFollowups.append("letztes") + validFollowups.append("letzten") + validFollowups.append("letztem") + validFollowups.append("jetzt") + + words = clean_string(text) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + if word in timeQualifiersList: + timeQualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "heute" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "morgen" and not fromFlag and wordPrev != "am" and \ + wordPrev not in days: # morgen means tomorrow if not "am + # Morgen" and not [day of the week] morgen + dayOffset = 1 + used += 1 + elif word == "übermorgen" and not fromFlag: + dayOffset = 2 + used += 1 + # parse 5 days, 10 weeks, last week, next week + elif word[:3] == "tag" and len(word) <= 5: + num = is_number_de(wordPrev) + if num: + dayOffset += num + start -= 1 + used = 2 + elif word[:5] == "woche" and len(word) <= 7 and not fromFlag: + num = is_number_de(wordPrev) + if num: + dayOffset += num * 7 + start -= 1 + used = 2 + elif wordPrev[:6] == "nächst": + dayOffset = 7 + start -= 1 + used = 2 + elif wordPrev[:5] == "letzt": + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word[:5] == "monat" and len(word) <= 7 and not fromFlag: + num = is_number_de(wordPrev) + if num: + monthOffset = num + start -= 1 + used = 2 + elif wordPrev[:6] == "nächst": + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev[:5] == "letzt": + monthOffset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word[:4] == "jahr" and len(word) <= 6 and not fromFlag: + num = is_number_de(wordPrev) + if num: + 
yearOffset = num + start -= 1 + used = 2 + elif wordPrev[:6] == "nächst": + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev[:6] == "nächst": + yearOffset = -1 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordNext == "morgen": # morgen means morning if preceded by + # the day of the week + words[idx + 1] = "früh" + if wordPrev[:6] == "nächst": + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev[:5] == "letzt": + dayOffset -= 7 + used += 1 + start -= 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort and not fromFlag: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months[m] + if wordPrev and (wordPrev[0].isdigit() or + (wordPrev == "of" and wordPrevPrev[0].isdigit())): + if wordPrev == "of" and wordPrevPrev[0].isdigit(): + datestr += " " + words[idx - 2] + used += 1 + start -= 1 + else: + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + + if ( + word == "von" or word == "nach" or word == "ab") and wordNext \ + in validFollowups: + used = 2 + fromFlag = True + if wordNext == "morgen" and wordPrev != "am" and \ + wordPrev not in days: # morgen means tomorrow if not "am + # Morgen" and not [day of the week] morgen: + dayOffset += 1 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 
2 + if tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNext[:6] == "nächst": + tmpOffset += 7 + used += 1 + start -= 1 + elif wordNext[:5] == "letzt": + tmpOffset -= 7 + used += 1 + start -= 1 + dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and words[start - 1].startswith("diese"): + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + timeStr = "" + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + wordNextNextNextNext = words[idx + 4] if idx + 4 < len(words) else "" + wordNextNextNextNextNext = words[idx + 5] if idx + 5 < len(words) else "" + + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word[:6] == "mittag": + hrAbs = 12 + used += 1 + elif word[:11] == "mitternacht": + hrAbs = 0 + used += 1 + elif word == "morgens" or ( + wordPrev == "am" and word == "morgen") or word == "früh": + if not hrAbs: + hrAbs = 8 + used += 1 + elif word[:10] == "nachmittag": + if not hrAbs: + hrAbs = 15 + used += 1 + elif word[:5] == "abend": + if not hrAbs: + hrAbs = 19 + used += 1 + # parse half an hour, quarter hour + elif word[:5] == "nacht": + if not hrAbs: + hrAbs = 23 + used += 1 + elif word[:6] == "stunde" and \ + (wordPrev in markers or wordPrevPrev in markers): + factor = is_number_de(word) or 1 + minOffset = 60 * factor + if wordPrevPrev in markers: + words[idx - 2] = 
"" + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + timeQualifier = "" + remainder = "" + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord in eveningQualifiers: + used += 1 + timeQualifier = "pm" + elif nextWord in timeQualifiersList: + used += 1 + timeQualifier = "am" + elif nextWord == "uhr": + used += 1 + if wordNextNext in eveningQualifiers: + used += 1 + timeQualifier = "pm" + elif wordNextNext in timeQualifiersList: + used += 1 + timeQualifier = "am" + elif strHH.isdigit(): + if int(strHH) > 12: + timeQualifier = "pm" + else: + timeQualifier = "am" + else: + # try to parse # s without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + timeQualifier = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." 
or + wordNext == "a.m."): + strHH = strNum + timeQualifier = "am" + used = 1 + else: + if wordNext[:6] == "stunde" and len(wordNext) <= 7: + # "in 3 hours" + hrOffset = is_number_de(word) or 1 + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext[:6] == "minute" and len(wordNext) <= 7: + # "in 10 minutes" + minOffset = is_number_de(word) or 1 + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext[:7] == "sekunde" and len(wordNext) <= 8: + # in 5 seconds + secOffset = is_number_de(word) or 1 + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "uhr": + strHH = word + used += 1 + isTime = True + if wordNextNext in timeQualifiersList or \ + wordNextNextNext in timeQualifiersList \ + and not is_number_de(wordNextNext): + strMM = "" + if wordNextNext[:10] == "nachmittag": + used += 1 + timeQualifier = "pm" + elif wordNextNext == "am" and wordNextNextNext == \ + "nachmittag": + used += 2 + timeQualifier = "pm" + elif wordNextNext[:6] == "mittag": + used += 1 + timeQualifier = "am" + elif wordNextNext == "am" and wordNextNextNext == \ + "mittag": + used += 2 + timeQualifier = "am" + elif wordNextNext[:5] == "abend": + used += 1 + timeQualifier = "pm" + elif wordNextNext == "am" and wordNextNextNext == \ + "abend": + used += 2 + timeQualifier = "pm" + elif wordNextNext[:7] == "morgens": + used += 1 + timeQualifier = "am" + elif wordNextNext == "am" and wordNextNextNext == \ + "morgen": + used += 2 + timeQualifier = "am" + elif wordNextNext[:5] == "nacht": + used += 1 + if 8 <= int(word) <= 12: + timeQualifier = "pm" + else: + timeQualifier = "am" + + elif is_numeric_de(wordNextNext): + strMM = wordNextNext + used += 1 + # TTS failure "16 Uhr 30 Uhr" (common with google) + if wordNextNextNext == "uhr": + used += 1 + wordNextNextNext = wordNextNextNextNext + wordNextNextNextNext = wordNextNextNextNextNext + if wordNextNextNext in timeQualifiersList or \ + wordNextNextNextNext in timeQualifiersList: + if 
wordNextNextNext[:10] == "nachmittag": + used += 1 + timeQualifier = "pm" + elif wordNextNextNext == "am" and \ + wordNextNextNextNext == "nachmittag": + used += 2 + timeQualifier = "pm" + elif wordNextNext[:6] == "mittag": + used += 1 + timeQualifier = "am" + elif wordNextNext == "am" and wordNextNextNext == \ + "mittag": + used += 2 + timeQualifier = "am" + elif wordNextNextNext[:5] == "abend": + used += 1 + timeQualifier = "pm" + elif wordNextNextNext == "am" and \ + wordNextNextNextNext == "abend": + used += 2 + timeQualifier = "pm" + elif wordNextNextNext[:7] == "morgens": + used += 1 + timeQualifier = "am" + elif wordNextNextNext == "am" and \ + wordNextNextNextNext == "morgen": + used += 2 + timeQualifier = "am" + elif wordNextNextNext == "nachts": + used += 1 + if 8 <= int(word) <= 12: + timeQualifier = "pm" + else: + timeQualifier = "am" + elif strHH.isdigit(): + if int(strHH) > 12: + timeQualifier = "pm" + else: + timeQualifier = "am" + elif strHH.isdigit(): + if int(strHH) > 12: + timeQualifier = "pm" + else: + timeQualifier = "am" + + elif wordNext in timeQualifiersList or \ + wordNextNext in timeQualifiersList: + strHH = word + strMM = 00 + isTime = True + if wordNext[:10] == "nachmittag": + used += 1 + timeQualifier = "pm" + elif wordNext == "am" and wordNextNext == "nachmittag": + used += 2 + timeQualifier = "pm" + elif wordNextNext[:6] == "mittag": + used += 1 + timeQualifier = "am" + elif wordNextNext == "am" and wordNextNextNext == \ + "mittag": + used += 2 + timeQualifier = "am" + elif wordNext[:5] == "abend": + used += 1 + timeQualifier = "pm" + elif wordNext == "am" and wordNextNext == "abend": + used += 2 + timeQualifier = "pm" + elif wordNext[:7] == "morgens": + used += 1 + timeQualifier = "am" + elif wordNext == "am" and wordNextNext == "morgen": + used += 2 + timeQualifier = "am" + elif wordNext == "nachts": + used += 1 + if 8 <= int(word) <= 12: + timeQualifier = "pm" + else: + timeQualifier = "am" + + if timeQualifier == "": + isTime = 
False + + strHH = int(strHH) if strHH else 0 + strMM = int(strMM) if strMM else 0 + if timeQualifier != "": + if strHH <= 12 and timeQualifier == "pm" and not \ + (strHH == 12 and any([q in words for q in ("pm", "p.m.")])): + if strHH == 12: + strHH = 0 + dayOffset += 1 + else: + strHH += 12 + if strHH > 24 or strMM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = strHH * 1 + minAbs = strMM * 1 + used += 1 + if used > 0: + # removed parsed words from the sentence + for i in range(used): + words[idx + i] = "" + + if wordPrev == "Uhr": + words[words.index(wordPrev)] = "" + + if wordPrev == "früh": + hrOffset = -1 + words[idx - 1] = "" + idx -= 1 + elif wordPrev == "spät": + hrOffset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and wordPrev in markers: + words[idx - 1] = "" + if idx > 1 and wordPrevPrev in markers: + words[idx - 2] = "" + + idx += used - 1 + found = True + + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow + extractedDate = extractedDate.replace(microsecond=0, + second=0, + minute=0, + hour=0) + if datestr != "": + en_months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', + 'aug', + 'sept', 'oct', 'nov', 'dec'] + for idx, en_month in enumerate(en_months): + datestr = datestr.replace(months[idx], en_month) + for idx, en_month in enumerate(en_monthsShort): + datestr = datestr.replace(monthsShort[idx], en_month) + + if hasYear: + temp = datetime.strptime(datestr, "%B %d %Y") + else: + temp = datetime.strptime(datestr, "%B %d") + + if extractedDate.tzinfo: + temp = temp.replace(tzinfo=extractedDate.tzinfo) + + if not hasYear: + temp = temp.replace(year=extractedDate.year) + if extractedDate < temp: + extractedDate = extractedDate.replace(year=int(currentYear), + month=int( + 
temp.strftime( + "%m")), + day=int(temp.strftime( + "%d"))) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + + if timeStr != "": + temp = datetime(timeStr) + extractedDate = extractedDate.replace(hour=temp.strftime("%H"), + minute=temp.strftime("%M"), + second=temp.strftime("%S")) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + + if hrAbs is None and minAbs is None and default_time: + hrAbs = default_time.hour + minAbs = default_time.minute + + if hrAbs != -1 and minAbs != -1: + + extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, + minutes=minAbs or 0) + if (hrAbs or minAbs) and datestr == "": + if not daySpecified and dateNow > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + for idx, word in enumerate(words): + if words[idx] == "und" and words[idx - 1] == "" \ + and words[idx + 1] == "": + words[idx] = "" + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + + return [extractedDate, resultStr] diff --git a/ovos_date_parser/dates_en.py b/ovos_date_parser/dates_en.py new file mode 100644 index 0000000..a3766f0 --- /dev/null +++ b/ovos_date_parser/dates_en.py @@ -0,0 +1,1190 @@ +import re +from datetime import datetime, timedelta, time + +from dateutil.relativedelta import relativedelta +from 
ovos_number_parser.numbers_en import extract_number_en, _convert_words_to_numbers_en, pronounce_number_en +from ovos_number_parser.util import is_numeric +from ovos_utils.time import now_local, DAYS_IN_1_YEAR, DAYS_IN_1_MONTH + + +def nice_time_en(dt, speech=True, use_24hour=False, use_ampm=False): + """ + Format a time to a comfortable human format + For example, generate 'five thirty' for speech or '5:30' for + text display. + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. "3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + if use_24hour: + speak = "" + + # Either "0 8 hundred" or "13 hundred" + if string[0] == '0': + speak += pronounce_number_en(int(string[0])) + " " + speak += pronounce_number_en(int(string[1])) + else: + speak = pronounce_number_en(int(string[0:2])) + + speak += " " + if string[3:5] == '00': + speak += "hundred" + else: + if string[3] == '0': + speak += pronounce_number_en(0) + " " + speak += pronounce_number_en(int(string[4])) + else: + speak += pronounce_number_en(int(string[3:5])) + return speak + else: + if dt.hour == 0 and dt.minute == 0: + return "midnight" + elif dt.hour == 12 and dt.minute == 0: + return "noon" + + hour = dt.hour % 12 or 12 # 12 hour clock and 0 is spoken as 12 + if dt.minute == 15: + speak = "quarter past " + pronounce_number_en(hour) + elif dt.minute == 30: + speak = "half past " + pronounce_number_en(hour) + elif 
dt.minute == 45: + next_hour = (dt.hour + 1) % 12 or 12 + speak = "quarter to " + pronounce_number_en(next_hour) + else: + speak = pronounce_number_en(hour) + + if dt.minute == 0: + if not use_ampm: + return speak + " o'clock" + else: + if dt.minute < 10: + speak += " oh" + speak += " " + pronounce_number_en(dt.minute) + + if use_ampm: + if dt.hour > 11: + speak += " p.m." + else: + speak += " a.m." + + return speak + + +def extract_duration_en(text): + """ + Convert an english phrase into a number of seconds + + Convert things like: + "10 minute" + "2 and a half hours" + "3 days 8 hours 10 minutes and 49 seconds" + into an int, representing the total number of seconds. + + The words used in the duration will be consumed, and + the remainder returned. + + As an example, "set a timer for 5 minutes" would return + (300, "set a timer for"). + + Args: + text (str): string containing a duration + + Returns: + (timedelta, str): + A tuple containing the duration and the remaining text + not consumed in the parsing. The first value will + be None if no duration is found. The text returned + will have whitespace stripped from the ends. + """ + if not text: + return None + + time_units = { + 'microseconds': 0, + 'milliseconds': 0, + 'seconds': 0, + 'minutes': 0, + 'hours': 0, + 'days': 0, + 'weeks': 0 + } + # NOTE: these are spelled wrong on purpose because of the loop below that strips the s + units = ['months', 'years', 'decades', 'centurys', 'millenniums'] + \ + list(time_units.keys()) + + pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}s?" 
+ text = _convert_words_to_numbers_en(text) + text = text.replace("centuries", "century").replace("millenia", "millennium") + for word in ('day', 'month', 'year', 'decade', 'century', 'millennium'): + text = text.replace(f'a {word}', f'1 {word}') + + for unit_en in units: + unit_pattern = pattern.format(unit=unit_en[:-1]) # remove 's' from unit + + def repl(match): + time_units[unit_en] += float(match.group(1)) + return '' + + def repl_non_std(match): + val = float(match.group(1)) + if unit_en == "months": + val = DAYS_IN_1_MONTH * val + if unit_en == "years": + val = DAYS_IN_1_YEAR * val + if unit_en == "decades": + val = 10 * DAYS_IN_1_YEAR * val + if unit_en == "centurys": + val = 100 * DAYS_IN_1_YEAR * val + if unit_en == "millenniums": + val = 1000 * DAYS_IN_1_YEAR * val + time_units["days"] += val + return '' + + if unit_en not in time_units: + text = re.sub(unit_pattern, repl_non_std, text) + else: + text = re.sub(unit_pattern, repl, text) + + text = text.strip() + duration = timedelta(**time_units) if any(time_units.values()) else None + + return (duration, text) + + +def extract_datetime_en(text, anchorDate=None, default_time=None): + """ Convert a human date reference into an exact datetime + + Convert things like + "today" + "tomorrow afternoon" + "next Tuesday at 4pm" + "August 3rd" + into a datetime. If a reference date is not provided, the current + local time is used. Also consumes the words used to define the date + returning the remaining string. For example, the string + "what is Tuesday's weather forecast" + returns the date for the forthcoming Tuesday relative to the reference + date and the remainder string + "what is weather forecast". + + The "next" instance of a day or weekend is considered to be no earlier than + 48 hours in the future. On Friday, "next Monday" would be in 3 days. + On Saturday, "next Monday" would be in 9 days. 
+ + Args: + text (str): string containing date words + anchorDate (datetime): A reference date/time for "tommorrow", etc + default_time (time): Time to set if no time was found in the string + + Returns: + [datetime, str]: An array containing the datetime and the remaining + text not consumed in the parsing, or None if no + date or time related text was found. + """ + + def clean_string(s): + # normalize and lowercase utt (replaces words with numbers) + s = _convert_words_to_numbers_en(s, ordinals=None) + # clean unneeded punctuation and capitalization among other things. + s = s.lower().replace('?', '').replace(',', '') \ + .replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ') \ + .replace("o' clock", "o'clock").replace("o clock", "o'clock") \ + .replace("o ' clock", "o'clock").replace("o 'clock", "o'clock") \ + .replace("oclock", "o'clock").replace("couple", "2") \ + .replace("centuries", "century").replace("decades", "decade") \ + .replace("millenniums", "millennium") + + wordList = s.split() + for idx, word in enumerate(wordList): + word = word.replace("'s", "") + + ordinals = ["rd", "st", "nd", "th"] + if word[0].isdigit(): + for ordinal in ordinals: + # "second" is the only case we should not do this + if ordinal in word and "second" not in word: + word = word.replace(ordinal, "") + wordList[idx] = word + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if not anchorDate: + anchorDate = now_local() + + if text == "": + return None + default_time = default_time or time(0, 0, 0) + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + today = anchorDate.strftime("%w") + wkday = anchorDate.weekday() # 0 - monday + currentYear = anchorDate.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + 
timeQualifiersAM = ['morning'] + timeQualifiersPM = ['afternoon', 'evening', 'night', 'tonight'] + timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) + year_markers = ['in', 'on', 'of'] + past_markers = ["last", "past"] + earlier_markers = ["ago", "earlier"] + later_markers = ["after", "later"] + future_markers = ["in", "within"] # in a month -> + 1 month timedelta + future_1st_markers = ["next"] # next month -> day 1 of next month + markers = year_markers + ['at', 'by', 'this', 'around', 'for', "within"] + days = ['monday', 'tuesday', 'wednesday', + 'thursday', 'friday', 'saturday', 'sunday'] + months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + recur_markers = days + [d + 's' for d in days] + ['weekend', 'weekday', + 'weekends', 'weekdays'] + monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', + 'sept', 'oct', 'nov', 'dec'] + year_multiples = ["decade", "century", "millennium"] + day_multiples = ["weeks", "months", "years"] + past_markers = ["was", "last", "past"] + + words = clean_string(text) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + word = word.rstrip('s') + start = idx + used = 0 + # save timequalifier for later + if word in earlier_markers and dayOffset: + dayOffset = - dayOffset + used += 1 + elif word == "now" and not datestr: + resultStr = " ".join(words[idx + 1:]) + resultStr = ' '.join(resultStr.split()) + extractedDate = anchorDate.replace(microsecond=0) + return [extractedDate, resultStr] + elif wordNext in year_multiples: + multiplier = None + if is_numeric(word): + try: + multiplier = float(word) + except: + 
multiplier = extract_number_en(word) + multiplier = multiplier or 1 + _leftover = "0" + if int(multiplier) != multiplier: + multiplier, _leftover = str(multiplier).split(".") + multiplier = int(multiplier) + + used += 2 + if wordNext == "decade": + yearOffset = multiplier * 10 + int(_leftover[:1]) + elif wordNext == "century": + yearOffset = multiplier * 100 + int(_leftover[:2]) * 10 + elif wordNext == "millennium": + yearOffset = multiplier * 1000 + int(_leftover[:3]) * 100 + + if wordNextNext in earlier_markers: + yearOffset = yearOffset * -1 + used += 1 + elif word in past_markers: + yearOffset = yearOffset * -1 + elif wordPrev in past_markers: + yearOffset = yearOffset * -1 + start -= 1 + used += 1 + + elif word in year_markers and wordNext.isdigit() and len(wordNext) == 4: + yearOffset = int(wordNext) - int(currentYear) + used += 2 + hasYear = True + # couple of + elif word == "2" and wordNext == "of" and \ + wordNextNext in year_multiples: + multiplier = 2 + used += 3 + if wordNextNext == "decade": + yearOffset = multiplier * 10 + elif wordNextNext == "century": + yearOffset = multiplier * 100 + elif wordNextNext == "millennium": + yearOffset = multiplier * 1000 + elif word == "2" and wordNext == "of" and \ + wordNextNext in day_multiples: + multiplier = 2 + used += 3 + if wordNextNext == "years": + yearOffset = multiplier + elif wordNextNext == "months": + monthOffset = multiplier + elif wordNextNext == "weeks": + dayOffset = multiplier * 7 + elif word in timeQualifiersList: + timeQualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "today" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "tomorrow" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "day" and wordNext == "before" and wordNextNext == "yesterday" and not fromFlag: + dayOffset = -2 + used += 3 + elif word == "before" and wordNext == "yesterday" and not fromFlag: + dayOffset = -2 + used += 2 + elif word == "yesterday" and not fromFlag: + dayOffset = 
-1 + used += 1 + elif (word == "day" and + wordNext == "after" and + wordNextNext == "tomorrow" and + not fromFlag and + (not wordPrev or not wordPrev[0].isdigit())): + dayOffset = 2 + used = 3 + if wordPrev == "the": + start -= 1 + used += 1 + # parse 5 days, 10 weeks, last week, next week + elif word == "day" and wordNext not in earlier_markers: + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + if wordPrevPrev in past_markers: + dayOffset = dayOffset * -1 + start -= 1 + used += 1 + + # next day + # normalize step makes "in a day" -> "in day" + elif wordPrev and wordPrev in future_markers + future_1st_markers: + dayOffset += 1 + start -= 1 + used = 2 + elif wordPrev in past_markers: + dayOffset = -1 + start -= 1 + used = 2 + # parse X days ago + elif word == "day" and wordNext in earlier_markers: + if wordPrev and wordPrev[0].isdigit(): + dayOffset -= int(wordPrev) + start -= 1 + used = 3 + else: + dayOffset -= 1 + used = 2 + # parse last/past/next week and in/after X weeks + elif word == "week" and not fromFlag and wordPrev and wordNext not in earlier_markers: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + if wordPrevPrev in past_markers: + dayOffset = dayOffset * -1 + start -= 1 + used += 1 + # next week -> next monday + elif wordPrev in future_1st_markers: + dayOffset = 7 - wkday + start -= 1 + used = 2 + # normalize step makes "in a week" -> "in week" + elif wordPrev in future_markers: + dayOffset = 7 + start -= 1 + used = 2 + elif wordPrev in past_markers: + dayOffset = -7 + start -= 1 + used = 2 + # parse X weeks ago + elif word == "week" and not fromFlag and wordNext in earlier_markers: + if wordPrev[0].isdigit(): + dayOffset -= int(wordPrev) * 7 + start -= 1 + used = 3 + else: + dayOffset -= 7 + used = 2 + # parse last/past/next weekend and in/after X weekends + elif word == "weekend" and not fromFlag and wordPrev and wordNext not in earlier_markers: + # in/after X 
weekends + if wordPrev[0].isdigit(): + n = int(wordPrev) + dayOffset += 7 - wkday # next monday -> 1 weekend + n -= 1 + dayOffset += n * 7 + start -= 1 + used = 2 + if wordPrevPrev in past_markers: + dayOffset = dayOffset * -1 + start -= 1 + used += 1 + # next weekend -> next saturday + elif wordPrev in future_1st_markers: + if wkday < 5: + dayOffset = 5 - wkday + elif wkday == 5: + dayOffset = 7 + else: + dayOffset = 6 + start -= 1 + used = 2 + # normalize step makes "in a weekend" -> "in weekend" (next monday) + elif wordPrev in future_markers: + dayOffset += 7 - wkday # next monday + start -= 1 + used = 2 + # last/past weekend -> last/past saturday + elif wordPrev in past_markers: + dayOffset -= wkday + 2 + start -= 1 + used = 2 + # parse X weekends ago + elif word == "weekend" and not fromFlag and wordNext in earlier_markers: + dayOffset -= wkday + 3 # past friday "one weekend ago" + used = 2 + # X weekends ago + if wordPrev and wordPrev[0].isdigit(): + n = int(wordPrev) - 1 + dayOffset -= n * 7 + start -= 1 + used = 3 + # parse 10 months, next month, last month + elif word == "month" and not fromFlag and wordPrev and wordNext not in earlier_markers: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + if wordPrevPrev in past_markers: + monthOffset = monthOffset * -1 + start -= 1 + used += 1 + # next month -> day 1 + elif wordPrev in future_1st_markers: + next_dt = (anchorDate.replace(day=1) + timedelta(days=32)).replace(day=1) + dayOffset = (next_dt - anchorDate).days + start -= 1 + used = 2 + # normalize step makes "in a month" -> "in month" + elif wordPrev in future_markers: + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev in past_markers: + monthOffset = -1 + start -= 1 + used = 2 + elif word == "month" and wordNext in earlier_markers: + if wordPrev and wordPrev[0].isdigit(): + monthOffset -= int(wordPrev) + start -= 1 + used = 3 + else: + monthOffset -= 1 + used = 2 + # parse 5 years, next year, last year + elif word 
== "year" and not fromFlag and wordPrev and wordNext not in earlier_markers: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + if wordPrevPrev in past_markers: + yearOffset = yearOffset * -1 + start -= 1 + used += 1 + # next year -> day 1 + elif wordPrev in future_1st_markers: + next_dt = anchorDate.replace(day=1, month=1, year=anchorDate.year + 1) + dayOffset = (next_dt - anchorDate).days + start -= 1 + used = 2 + # normalize step makes "in a year" -> "in year" + elif wordPrev in future_markers: + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev in past_markers: + yearOffset = -1 + start -= 1 + used = 2 + elif word == "year" and wordNext in earlier_markers: + if wordPrev and wordPrev[0].isdigit(): + yearOffset -= int(wordPrev) + start -= 1 + used = 3 + else: + yearOffset -= 1 + used = 2 + + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "next": + if dayOffset <= 2: + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev in past_markers: + dayOffset -= 7 + used += 1 + start -= 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort and not fromFlag: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months[m] + if wordPrev and (wordPrev[0].isdigit() or + (wordPrev == "of" and wordPrevPrev[0].isdigit())): + if wordPrev == "of" and wordPrevPrev[0].isdigit(): + datestr += " " + words[idx - 2] + used += 1 + start -= 1 + else: + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " 
" + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + # if no date indicators found, it may not be the month of May + # may "i/we" ... + # "... may be" + elif word == 'may' and wordNext in ['i', 'we', 'be']: + datestr = "" + # when was MONTH + elif not hasYear and wordPrev in past_markers: + if anchorDate.month > m: + datestr += f" {anchorDate.year}" + else: + datestr += f" {anchorDate.year - 1}" + hasYear = True + # when is MONTH + elif not hasYear: + if anchorDate.month > m: + datestr += f" {anchorDate.year + 1}" + else: + datestr += f" {anchorDate.year}" + hasYear = True + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("today") + validFollowups.append("tomorrow") + validFollowups.append("yesterday") + validFollowups.append("next") + validFollowups.append("last") + validFollowups.append("past") + validFollowups.append("now") + validFollowups.append("this") + if (word == "from" or word == "after") and wordNext in validFollowups: + used = 2 + fromFlag = True + if wordNext == "tomorrow": + dayOffset += 1 + elif wordNext == "yesterday": + dayOffset -= 1 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNext in future_1st_markers: + if dayOffset <= 2: + tmpOffset += 7 + used += 1 + start -= 1 + elif wordNext in past_markers: + tmpOffset -= 7 + used += 1 + start -= 1 + dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and words[start - 1] == "this": + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 
0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "noon": + hrAbs = 12 + used += 1 + elif word == "midnight": + hrAbs = 0 + used += 1 + elif word == "morning": + if hrAbs is None: + hrAbs = 8 + used += 1 + elif word == "afternoon": + if hrAbs is None: + hrAbs = 15 + used += 1 + elif word == "evening": + if hrAbs is None: + hrAbs = 19 + used += 1 + elif word == "tonight" or word == "night": + if hrAbs is None: + hrAbs = 22 + # used += 1 ## NOTE this breaks other tests, TODO refactor me! + + # couple of time_unit + elif word == "2" and wordNext == "of" and \ + wordNextNext in ["hours", "minutes", "seconds"]: + used += 3 + if wordNextNext == "hours": + hrOffset = 2 + elif wordNextNext == "minutes": + minOffset = 2 + elif wordNextNext == "seconds": + secOffset = 2 + # parse in a/next second/minute/hour + elif wordNext == "hour" and word in future_markers + future_1st_markers: + used += 2 + hrOffset = 1 + elif wordNext == "minute" and word in future_markers + future_1st_markers: + used += 2 + minOffset = 1 + elif wordNext == "second" and word in future_markers + future_1st_markers: + used += 2 + secOffset = 1 + # parse last/past second/minute/hour + elif wordNext == "hour" and word in past_markers: + used += 2 + hrOffset = - 1 + elif wordNext == "minute" and word in past_markers: + used += 2 + minOffset = - 1 + elif wordNext == "second" and word in past_markers: + used += 2 + secOffset = - 1 + # parse half an hour, quarter hour + elif word == "hour" and \ + (wordPrev in markers or wordPrevPrev in markers): + if wordPrev == "half": + minOffset = 30 + elif wordPrev == "quarter": + minOffset = 15 + elif 
wordPrevPrev == "quarter": + minOffset = 15 + if idx > 2 and words[idx - 3] in markers: + words[idx - 3] = "" + words[idx - 2] = "" + elif wordPrev == "within": + hrOffset = 1 + else: + hrOffset = 1 + if wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "this": + daySpecified = True + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + if wordNext == "tonight" or wordNextNext == "tonight" or \ + wordPrev == "tonight" or wordPrevPrev == "tonight" or \ + wordNextNextNext == "tonight": + remainder = "pm" + used += 1 + if wordPrev == "tonight": + words[idx - 1] = "" + if wordPrevPrev == "tonight": + words[idx - 2] = "" + if wordNextNext == "tonight": + used += 1 + if wordNextNextNext == "tonight": + used += 1 + + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + + elif wordNext == "in" and wordNextNext == "the" and \ + words[idx + 3] == "morning": + remainder = "am" + used += 3 + elif wordNext == "in" and wordNextNext == "the" and \ + words[idx + 3] == "afternoon": + remainder = "pm" + used += 3 + elif wordNext == "in" and wordNextNext == "the" and \ + words[idx + 3] == "evening": + remainder = "pm" + used += 3 + elif wordNext == "in" and wordNextNext == "morning": + remainder = "am" + used += 2 + elif wordNext == "in" and wordNextNext == "afternoon": + remainder = "pm" + 
used += 2 + elif wordNext == "in" and wordNextNext == "evening": + remainder = "pm" + used += 2 + elif wordNext == "this" and wordNextNext == "morning": + remainder = "am" + used = 2 + daySpecified = True + elif wordNext == "this" and wordNextNext == "afternoon": + remainder = "pm" + used = 2 + daySpecified = True + elif wordNext == "this" and wordNextNext == "evening": + remainder = "pm" + used = 2 + daySpecified = True + elif wordNext == "at" and wordNextNext == "night": + if strHH and int(strHH) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + + else: + if timeQualifier != "": + military = True + if strHH and int(strHH) <= 12 and \ + (timeQualifier in timeQualifiersPM): + strHH += str(int(strHH) + 12) + + else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." 
or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + wordNext in recur_markers or + wordNextNext in recur_markers): + # Ex: "7 on mondays" or "3 this friday" + # Set strHH so that isTime == True + # when am or pm is not specified + strHH = strNum + used = 1 + else: + if ( + int(strNum) > 100 and + ( + wordPrev == "o" or + wordPrev == "oh" + )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if wordNext == "hours": + used += 1 + elif ( + (wordNext == "hours" or wordNext == "hour" or + remainder == "hours" or remainder == "hour") and + word[0] != '0' and + (int(strNum) < 100 or int(strNum) > 2400 or wordPrev in past_markers)): + # ignores military time + # "in 3 hours" + hrOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + # in last N hours + if wordPrev in past_markers: + start -= 1 + used += 1 + hrOffset = hrOffset * -1 + + elif wordNext == "minutes" or wordNext == "minute" or \ + remainder == "minutes" or remainder == "minute": + # "in 10 minutes" + minOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + # in last N minutes + if wordPrev in past_markers: + start -= 1 + used += 1 + minOffset = minOffset * -1 + elif wordNext == "seconds" or wordNext == "second" \ + or remainder == "seconds" or remainder == "second": + # in 5 seconds + secOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + # in last N seconds + if wordPrev in past_markers: + start -= 1 + used += 1 + secOffset = secOffset * -1 + elif int(strNum) > 100: + # military time, eg. "3300 hours" + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if wordNext == "hours" or wordNext == "hour" or \ + remainder == "hours" or remainder == "hour": + used += 1 + elif wordNext and wordNext[0].isdigit(): + # military time, e.g. 
"04 38 hours" + strHH = strNum + strMM = wordNext + military = True + used += 1 + if (wordNextNext == "hours" or + wordNextNext == "hour" or + remainder == "hours" or remainder == "hour"): + used += 1 + elif (wordNext == "" + or wordNext == "o'clock" + or (wordNext == "in" and (wordNextNext == "the" or wordNextNext == timeQualifier)) + or wordNext == 'tonight' + or wordNextNext == 'tonight'): + strHH = strNum + strMM = "00" + if wordNext == "o'clock": + used += 1 + + if wordNext == "in" or wordNextNext == "in": + used += (1 if wordNext == "in" else 2) + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if (wordNextNext and + (wordNextNext in timeQualifier or + wordNextNextNext in timeQualifier)): + if (wordNextNext in timeQualifiersPM or + wordNextNextNext in timeQualifiersPM): + remainder = "pm" + used += 1 + if (wordNextNext in timeQualifiersAM or + wordNextNextNext in timeQualifiersAM): + remainder = "am" + used += 1 + + if timeQualifier != "": + if timeQualifier in timeQualifiersPM: + remainder = "pm" + used += 1 + + elif timeQualifier in timeQualifiersAM: + remainder = "am" + used += 1 + else: + # TODO: Unsure if this is 100% accurate + used += 1 + military = True + else: + isTime = False + + HH = int(strHH) if strHH else 0 + MM = int(strMM) if strMM else 0 + HH = HH + 12 if remainder == "pm" and HH < 12 else HH + HH = HH - 12 if remainder == "am" and HH >= 12 else HH + + if (not military and + remainder not in ['am', 'pm', 'hours', 'minutes', + "second", "seconds", + "hour", "minute"] and + ((not daySpecified) or 0 <= dayOffset < 1)): + + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + if anchorDate.hour < HH or (anchorDate.hour == HH and + anchorDate.minute < MM): + pass # No modification needed + elif anchorDate.hour < HH + 12: + HH += 12 + else: + # has passed, assume the next morning + dayOffset += 1 + + if timeQualifier in timeQualifiersPM and HH < 12: + HH 
+= 12 + + if HH > 24 or MM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = HH + minAbs = MM + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + + if wordPrev == "o" or wordPrev == "oh": + words[words.index(wordPrev)] = "" + + if wordPrev == "early": + hrOffset = -1 + words[idx - 1] = "" + idx -= 1 + elif wordPrev == "late": + hrOffset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and wordPrev in markers: + words[idx - 1] = "" + if wordPrev == "this": + daySpecified = True + if idx > 1 and wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "this": + daySpecified = True + + idx += used - 1 + found = True + + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = anchorDate.replace(microsecond=0) + + if datestr != "": + # date included an explicit date, e.g. 
"june 5" or "june 2, 2017" + try: + temp = datetime.strptime(datestr, "%B %d") + except ValueError: + # Try again, allowing the year + try: + temp = datetime.strptime(datestr, "%B %d %Y") + except ValueError: + # Try again, without day + try: + temp = datetime.strptime(datestr, "%B %Y") + except ValueError: + # Try again, with only month + temp = datetime.strptime(datestr, "%B") + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + if not hasYear: + temp = temp.replace(year=extractedDate.year, + tzinfo=extractedDate.tzinfo) + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hrOffset == 0 and minOffset == 0 and secOffset == 0: + extractedDate = extractedDate.replace(hour=default_time.hour, + minute=default_time.minute, + second=default_time.second) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + + if hrAbs != -1 and minAbs != -1 and not hrOffset and not minOffset and not secOffset: + # If no time was supplied in the 
def nice_time_es(dt, speech=True, use_24hour=False, use_ampm=False):
    """
    Render a time of day in Spanish, as digits or as a spoken phrase.

    For example 'las doce y cuarto' for speech or '12:15' for display.

    Args:
        dt (datetime): time to format (assumed to already be local time)
        speech (bool): True for a speakable phrase, False for digits
        use_24hour (bool): 24-hour output instead of 12-hour output
        use_ampm (bool): in 12-hour mode, append AM/PM (display) or the
            part of the day such as "de la tarde" (speech)
    Returns:
        (str): the formatted time
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        # e.g. "3:01 AM" / "2:22 PM", or "3:01" / "2:22" without the marker
        string = dt.strftime("%I:%M %p" if use_ampm else "%I:%M")
        if string[0] == '0':
            string = string[1:]  # drop the leading zero of the hour

    if not speech:
        return string

    if use_24hour:
        # In 24-hour speech no "de la tarde"/"de la noche" qualifier is
        # added; 14:04 is read as "las catorce cero cuatro".
        if dt.hour == 1:
            hour_part = "la una"
        else:
            hour_part = "las " + pronounce_number_es(dt.hour)
        glue = " cero " if dt.minute < 10 else " "
        return hour_part + glue + pronounce_number_es(dt.minute)

    # 12-hour speech: minutes 35/40/45/50/55 are counted back from the
    # following hour ("las tres menos cuarto").
    if dt.minute in (35, 40, 45, 50, 55):
        minute = dt.minute - 60
        hour = dt.hour + 1
    else:
        minute = dt.minute
        hour = dt.hour

    if hour in (0, 12):
        speak = "las doce"
    elif hour in (1, 13):
        speak = "la una"
    elif hour < 13:
        speak = "las " + pronounce_number_es(hour)
    else:
        speak = "las " + pronounce_number_es(hour - 12)

    fixed_minutes = {15: " y cuarto", 30: " y media", -15: " menos cuarto"}
    if minute in fixed_minutes:
        speak += fixed_minutes[minute]
    elif minute > 0:
        # "seis y nueve", "siete y veinticinco"
        speak += " y " + pronounce_number_es(minute)
    elif minute < 0:
        # negative minutes are passed through as-is, with no "y"
        speak += " " + pronounce_number_es(minute)
    elif not use_ampm:
        # exact hour with no part-of-day qualifier: "las tres en punto"
        speak += " en punto"

    if use_ampm:
        # Part-of-day boundaries used here: before 6 "madrugada",
        # 6-12 "mañana", 13-20 "tarde", otherwise "noche".
        if hour < 6:
            speak += " de la madrugada"
        elif hour < 13:
            speak += " de la mañana"
        elif hour < 21:
            speak += " de la tarde"
        else:
            speak += " de la noche"
    return speak
def extract_datetime_es(text, anchorDate=None, default_time=None):
    """
    Extract a date/time reference from a Spanish phrase.

    Args:
        text (str): the phrase to parse
        anchorDate (datetime): reference "now"; defaults to now_local()
        default_time (time): time of day to use when the phrase names a
            date but no time
    Returns:
        [datetime, str]: the extracted datetime and the phrase with the
            consumed words removed, or None when text is empty or no
            date/time was recognised.
    """

    def clean_string(s):
        # cleans the input string of unneeded punctuation and capitalization
        # among other things
        symbols = [".", ",", ";", "?", "!", "º", "ª"]
        noise_words = ["entre", "la", "del", "al", "el", "de",
                       "para", "una", "cualquier", "a",
                       "e'", "esta", "este"]

        for word in symbols:
            s = s.replace(word, "")
        for word in noise_words:
            s = s.replace(" " + word + " ", " ")
        s = s.lower().replace(
            "á",
            "a").replace(
            "é",
            "e").replace(
            "ó",
            "o").replace(
            "-",
            " ").replace(
            "_",
            "")
        # handle synonyms and equivalents, "tomorrow early = tomorrow morning
        synonyms = {"mañana": ["amanecer", "temprano", "muy temprano"],
                    "tarde": ["media tarde", "atardecer"],
                    "noche": ["anochecer", "tarde"]}
        for syn in synonyms:
            for word in synonyms[syn]:
                s = s.replace(" " + word + " ", " " + syn + " ")
        # relevant plurals, cant just extract all s in pt
        wordlist = ["mañanas", "tardes", "noches", "días", "semanas",
                    "años", "minutos", "segundos", "las", "los", "siguientes",
                    "próximas", "próximos", "horas"]
        for _, word in enumerate(wordlist):
            s = s.replace(word, word.rstrip('s'))
        s = s.replace("meses", "mes").replace("anteriores", "anterior")
        return s

    def date_found():
        return found or \
            (
                    datestr != "" or
                    yearOffset != 0 or monthOffset != 0 or
                    dayOffset is True or hrOffset != 0 or
                    hrAbs or minOffset != 0 or
                    minAbs or secOffset != 0
            )

    def blank(position):
        # Clear a consumed token; positions outside the sentence are ignored
        # so an over-counted `used` can no longer raise IndexError.
        if 0 <= position < len(words):
            words[position] = ""

    if text == "":
        return None
    if anchorDate is None:
        anchorDate = now_local()

    found = False
    daySpecified = False
    dayOffset = False
    monthOffset = 0
    yearOffset = 0
    dateNow = anchorDate
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    words = clean_string(text).split(" ")
    # clean_string() folds á/é/ó to a/e/o, so every lookup word below is
    # spelled in that folded form; accented spellings could never match.
    timeQualifiersList = ['mañana', 'tarde', 'noche']
    time_indicators = ["en", "la", "al", "por", "pasados",
                       "pasadas", "día", "hora"]
    days = ['lunes', 'martes', 'miercoles',
            'jueves', 'viernes', 'sabado', 'domingo']
    months = ['enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio',
              'julio', 'agosto', 'septiembre', 'octubre', 'noviembre',
              'diciembre']
    monthsShort = ['ene', 'feb', 'mar', 'abr', 'may', 'jun', 'jul', 'ago',
                   'sep', 'oct', 'nov', 'dic']
    nexts = ["siguiente", "proximo", "proxima"]
    suffix_nexts = ["siguientes", "subsecuentes"]
    lasts = ["último", "última"]
    suffix_lasts = ["pasada", "pasado", "anterior", "antes"]
    nxts = ["despues", "siguiente", "proximo", "proxima"]
    prevs = ["antes", "previa", "previo", "anterior"]
    froms = ["desde", "en", "para", "despues de", "por", "proximo",
             "proxima", "de"]
    thises = ["este", "esta"]
    froms += thises
    lists = nxts + prevs + froms + time_indicators
    # words that may follow a "from"-style preposition
    validFollowups = days + months + monthsShort + [
        "hoy", "mañana", "ayer", "anteayer", "ahora", "ya", "ante"]

    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""

        start = idx
        used = 0
        # save timequalifier for later
        # NOTE(review): "mañana" is caught here as a time qualifier, so it
        # is never treated as "tomorrow" in this loop - confirm intended.
        if word in timeQualifiersList:
            timeQualifier = word

        # parse today, yesterday
        elif word == "hoy" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "ayer" and not fromFlag:
            dayOffset -= 1
            used += 1
        # "before yesterday" and "before before yesterday"
        elif (word == "anteayer" or
              (word == "ante" and wordNext == "ayer")) and not fromFlag:
            dayOffset -= 2
            used += 1
            if wordNext == "ayer":
                used += 1
        elif word == "ante" and wordNext == "ante" and wordNextNext == \
                "ayer" and not fromFlag:
            dayOffset -= 3
            used += 3
        # day after tomorrow
        elif word == "pasado" and wordNext == "mañana" and not fromFlag:
            dayOffset += 2
            used = 2
        # parse 5 days, 10 weeks, last week, next week, week after
        elif word == "día":
            if wordNext == "pasado" or wordNext == "ante":
                used += 1
                if wordPrev and wordPrev[0].isdigit():
                    dayOffset += int(wordPrev)
                    start -= 1
                    used += 1
            elif (wordPrev and wordPrev[0].isdigit() and
                  wordNext not in months and
                  wordNext not in monthsShort):
                dayOffset += int(wordPrev)
                start -= 1
                used += 2
            elif wordNext and wordNext[0].isdigit() and wordNextNext not in \
                    months and wordNextNext not in monthsShort:
                dayOffset += int(wordNext)
                start -= 1
                used += 2

        elif word == "semana" and not fromFlag:
            if wordPrev and wordPrev[0].isdigit():
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev in nexts:
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev in lasts:
                dayOffset = -7
                start -= 1
                used = 2
            # suffix forms consume the FOLLOWING word, so start stays put
            elif wordNext in suffix_nexts:
                dayOffset = 7
                used = 2
            elif wordNext in suffix_lasts:
                dayOffset = -7
                used = 2
        # parse 10 months, next month, last month
        elif word == "mes" and not fromFlag:
            if wordPrev and wordPrev[0].isdigit():
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev in nexts:
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev in lasts:
                monthOffset = -1
                start -= 1
                used = 2
            elif wordNext in suffix_nexts:
                monthOffset = 1
                used = 2
            elif wordNext in suffix_lasts:
                monthOffset = -1
                used = 2
        # parse 5 years, next year, last year
        elif word == "año" and not fromFlag:
            if wordPrev and wordPrev[0].isdigit():
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev in nexts:
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev in lasts:
                yearOffset = -1
                start -= 1
                used = 2
            elif wordNext in suffix_nexts:
                yearOffset = 1
                used = 2
            elif wordNext in suffix_lasts:
                yearOffset = -1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordPrev == "siguiente":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev == "pasado":
                dayOffset -= 7
                used += 1
                start -= 1
            if wordNext == "siguiente":
                # dayOffset += 7
                used += 1
            elif wordNext == "pasado":
                # dayOffset -= 7
                used += 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        elif word in months or word in monthsShort:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            # datestr is always "<full month name> [day] [year]"
            datestr = months[m]
            if wordPrev and wordPrev[0].isdigit():
                # 13 mayo
                datestr += " " + wordPrev
                start -= 1
                used += 1
                if wordNext and wordNext[0].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNext and wordNext[0].isdigit():
                # mayo 13
                datestr += " " + wordNext
                used += 1
                if wordNextNext and wordNextNext[0].isdigit():
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordPrevPrev and wordPrevPrev[0].isdigit():
                # 13 dia mayo
                # (wordNext cannot start with a digit here, that case is
                # handled by the branch above, so no year can follow)
                datestr += " " + wordPrevPrev
                start -= 2
                used += 2
                hasYear = False

            elif wordNextNext and wordNextNext[0].isdigit():
                # mayo dia 13
                datestr += " " + wordNextNext
                used += 2
                if wordNextNextNext and wordNextNextNext[0].isdigit():
                    datestr += " " + wordNextNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            if datestr in months:
                # a bare month name carries no usable date
                datestr = ""

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        if word in froms and wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "mañana":
                dayOffset += 1
            elif wordNext == "ayer":
                dayOffset -= 1
            elif wordNext == "anteayer":
                dayOffset -= 2
            elif wordNext == "ante" and wordNextNext == "ayer":
                dayOffset -= 2
            elif (wordNext == "ante" and wordNextNext == "ante" and
                  wordNextNextNext == "ayer"):
                dayOffset -= 3
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                if wordNextNext:
                    if wordNextNext in nxts:
                        tmpOffset += 7
                        used += 1
                    elif wordNextNext in prevs:
                        tmpOffset -= 7
                        used += 1
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - int(today)
                used = 3
                if wordNextNextNext:
                    if wordNextNextNext in nxts:
                        tmpOffset += 7
                        used += 1
                    elif wordNextNextNext in prevs:
                        tmpOffset -= 7
                        used += 1
                dayOffset += tmpOffset
            if wordNext in months:
                # leave the month itself for the month branch above
                used -= 1
        if used > 0:
            if start - 1 > 0 and words[start - 1] in lists:
                start -= 1
                used += 1

            for i in range(0, used):
                blank(start + i)

            if start - 1 >= 0 and words[start - 1] in lists:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # parse time
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = None
    minAbs = None

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == "medio" and wordNext == "día":
            hrAbs = 12
            used += 2
        elif word == "media" and wordNext == "noche":
            hrAbs = 0
            used += 2
        elif word == "mañana":
            if not hrAbs:
                hrAbs = 8
            used += 1
        elif word == "tarde":
            if not hrAbs:
                hrAbs = 15
            used += 1
        elif word == "media" and wordNext == "tarde":
            if not hrAbs:
                hrAbs = 17
            used += 2
        elif word == "media" and wordNext == "mañana":
            if not hrAbs:
                hrAbs = 10
            used += 2
        elif word == "madrugada":
            if not hrAbs:
                hrAbs = 1
            used += 1
        elif word == "noche":
            if not hrAbs:
                hrAbs = 21
            used += 1
        # parse half an hour, quarter hour
        elif (word == "hora" and
              (wordPrev in time_indicators or wordPrevPrev in
               time_indicators)):
            if wordPrev == "media":
                minOffset = 30
            elif wordPrev == "cuarto":
                minOffset = 15
            elif wordPrevPrev == "cuarto":
                minOffset = 15
                if idx > 2 and words[idx - 3] in time_indicators:
                    words[idx - 3] = ""
                words[idx - 2] = ""
            else:
                hrOffset = 1
            if wordPrevPrev in time_indicators:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            # -1 marks "relative offset only, no absolute time"
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # parse colons
                # "3:00 in the morning"
                # hours, optional ":minutes", then any glued suffix ("pm")
                colon_time = re.match(r"(\d+)(?::(\d*))?(.*)", word)
                strHH = colon_time.group(1)
                strMM = colon_time.group(2) or ""
                remainder = colon_time.group(3).replace(".", "")
                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif wordNext == "mañana" or wordNext == "madrugada":
                        remainder = "am"
                        used += 1
                    elif wordNext == "tarde":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "noche":
                        # small hours "at night" are after midnight
                        if 0 < int(strHH) < 6:
                            remainder = "am"
                        else:
                            remainder = "pm"
                        used += 1
                    elif wordNext in thises and wordNextNext == "mañana":
                        remainder = "am"
                        used = 2
                    elif wordNext in thises and wordNextNext == "tarde":
                        remainder = "pm"
                        used = 2
                    elif wordNext in thises and wordNextNext == "noche":
                        remainder = "pm"
                        used = 2
                    elif timeQualifier in ("tarde", "noche"):
                        # no marker next to the time: fall back on an
                        # afternoon/night qualifier seen in the sentence
                        remainder = "pm"

            else:
                # try to parse # s without colons
                # 5 hours, 10 minutes etc.
                strNum = ""
                remainder = ""
                for char in word:
                    if char.isdigit():
                        strNum += char
                    else:
                        remainder += char
                # strNum is never empty: word starts with a digit
                num = int(strNum)

                if remainder == "":
                    remainder = wordNext.replace(".", "").lstrip().rstrip()

                if (
                        remainder == "pm" or
                        wordNext == "pm" or
                        remainder == "p.m." or
                        wordNext == "p.m."):
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif (
                        remainder == "am" or
                        wordNext == "am" or
                        remainder == "a.m." or
                        wordNext == "a.m."):
                    strHH = strNum
                    remainder = "am"
                    used = 1
                elif wordNext == "tarde":
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif wordNext == "mañana":
                    strHH = strNum
                    remainder = "am"
                    used = 1
                elif num > 100 and wordPrev == "cero":
                    # 0800 hours (pronounced oh-eight-hundred)
                    strHH, strMM = divmod(num, 100)
                    if wordNext == "hora":
                        used += 1
                elif (
                        wordNext == "hora" and
                        word[0] != '0' and
                        (num < 100 or num > 2400)):
                    # ignores military time
                    # "in 3 hours"
                    hrOffset = num
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1

                elif wordNext == "minuto":
                    # "in 10 minutes"
                    minOffset = num
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif wordNext == "segundo":
                    # in 5 seconds
                    secOffset = num
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif num > 100:
                    # military time, e.g. "1430"
                    strHH, strMM = divmod(num, 100)
                    if wordNext == "hora":
                        used += 1

                elif wordNext == "" or (
                        wordNext == "en" and wordNextNext == "punto"):
                    strHH = strNum
                    strMM = 0
                    if wordNext == "en" and wordNextNext == "punto":
                        used += 2
                        if wordNextNextNext == "tarde":
                            remainder = "pm"
                            used += 1
                        elif wordNextNextNext == "mañana":
                            remainder = "am"
                            used += 1
                        elif wordNextNextNext == "noche":
                            if 0 < num < 6:
                                remainder = "am"
                            else:
                                remainder = "pm"
                            used += 1

                elif wordNext.isdigit():
                    # "04 38" -> hours then minutes
                    strHH = strNum
                    strMM = wordNext
                    used += 1
                    if wordNextNext == "hora":
                        used += 1
                else:
                    isTime = False

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            strHH = strHH + 12 if (remainder == "pm" and
                                   0 < strHH < 12) else strHH
            strHH = strHH - 12 if (remainder == "am" and
                                   0 < strHH >= 12) else strHH
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH * 1
                minAbs = strMM * 1
                used += 1

        if used > 0:
            # removed parsed words from the sentence
            for i in range(used):
                blank(idx + i)

            # drop the token directly before the time when it is a
            # connector ("en", "punto") or a time indicator
            if idx > 0 and (wordPrev == "en" or wordPrev == "punto"):
                words[idx - 1] = ""

            if idx > 0 and wordPrev in time_indicators:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in time_indicators:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation

    # A purely relative time ("en 10 minutos") counts from the anchor's
    # current time of day; everything else starts from midnight.
    relative_only = (datestr == "" and
                     (hrOffset != 0 or minOffset != 0 or secOffset != 0) and
                     (hrAbs == -1 or minAbs == -1))
    extractedDate = dateNow.replace(microsecond=0)
    if not relative_only:
        extractedDate = extractedDate.replace(second=0, minute=0, hour=0)

    if datestr != "":
        # datestr is "<month> <day>" or "<month> <day> <year>"; it is parsed
        # directly so the result does not depend on the process locale.
        month_name, _, rest = datestr.partition(" ")
        date_parts = rest.split()
        month_num = months.index(month_name) + 1
        day_num = int(date_parts[0])

        if not hasYear:
            temp = extractedDate.replace(month=month_num, day=day_num)
            if extractedDate < temp:
                extractedDate = extractedDate.replace(
                    year=int(currentYear),
                    month=month_num,
                    day=day_num)
            else:
                # date is today or already passed: use next year
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=month_num,
                    day=day_num)
        else:
            extractedDate = extractedDate.replace(
                year=int(date_parts[1]),
                month=month_num,
                day=day_num)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)

    if hrAbs is None and minAbs is None and default_time:
        hrAbs = default_time.hour
        minAbs = default_time.minute

    if hrAbs != -1 and minAbs != -1:
        extractedDate = extractedDate + relativedelta(hours=hrAbs or 0,
                                                      minutes=minAbs or 0)
        if (hrAbs or minAbs) and datestr == "":
            # a bare time that has already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    return [extractedDate, resultStr]
def extract_duration_es(text):
    """
    Extract a duration from a Spanish phrase.

    Converts things like:
        "10 Minutos"
        "3 dias 8 horas 10 Minutos e 49 Segundos"
    into a timedelta. The words used in the duration are consumed and
    the remainder is returned.

    As an example, "pon un temporizador de 5 minutos" would return
    (timedelta(minutes=5), "pon un temporizador de").

    Args:
        text (str): string containing a duration
    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The first value will
                    be None if no duration is found. The text returned
                    will have whitespace stripped from the ends.
                    A bare None is returned when text is empty.
    """
    if not text:
        return None

    # Fold the accents used by the unit names so a single spelling per unit
    # is enough, and collapse the irregular plural "meses".
    text = text.lower()
    text = text.replace("í", "i").replace("é", "e").replace("ñ", "n")
    text = text.replace("meses", "mes")
    text = numbers_to_digits_es(text)

    # timedelta keyword -> singular Spanish unit (accents already folded)
    time_units = {
        'microseconds': 'microsegundo',
        'milliseconds': 'milisegundo',
        'seconds': 'segundo',
        'minutes': 'minuto',
        'hours': 'hora',
        'days': 'dia',
        'weeks': 'semana'
    }
    # units timedelta does not know, expressed as a number of days
    day_multiples = {
        "mes": DAYS_IN_1_MONTH,
        "ano": DAYS_IN_1_YEAR,
        "decada": 10 * DAYS_IN_1_YEAR,
        "siglo": 100 * DAYS_IN_1_YEAR,
        "milenio": 1000 * DAYS_IN_1_YEAR
    }

    # <number><space or hyphen><unit>, optional plural "s", whole word only
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}s?\b"
    totals = dict.fromkeys(time_units, 0)

    def consume(unit_es, key, factor):
        # Add every "<number> <unit_es>" found in text to totals[key]
        # (scaled by factor) and cut the match out of the text.
        def repl(match):
            totals[key] += factor * float(match.group("value"))
            return ''

        return re.sub(pattern.format(unit=unit_es), repl, text)

    for unit_en, unit_es in time_units.items():
        text = consume(unit_es, unit_en, 1)
    for unit_es, days_per_unit in day_multiples.items():
        text = consume(unit_es, "days", days_per_unit)

    text = text.strip()
    duration = timedelta(**totals) if any(totals.values()) else None

    return (duration, text)
+ + text = text.replace("í", "i").replace("é", "e").replace("ñ", "n").replace("meses", "mes") + text = numbers_to_digits_es(text) + + for (unit_en, unit_es) in time_units.items(): + unit_pattern = pattern.format( + unit=unit_es[:-1]) # remove 's' from unit + time_units[unit_en] = 0 + + def repl(match): + time_units[unit_en] += float(match.group(1)) + return '' + + text = re.sub(unit_pattern, repl, text) + + for (unit_en, unit_es) in non_std_un.items(): + unit_pattern = pattern.format( + unit=unit_es[:-1]) # remove 's' from unit + + def repl_non_std(match): + val = float(match.group(1)) + if unit_en == "months": + val = DAYS_IN_1_MONTH * val + if unit_en == "years": + val = DAYS_IN_1_YEAR * val + if unit_en == "decades": + val = 10 * DAYS_IN_1_YEAR * val + if unit_en == "centurys": + val = 100 * DAYS_IN_1_YEAR * val + if unit_en == "millenniums": + val = 1000 * DAYS_IN_1_YEAR * val + time_units["days"] += val + return '' + + text = re.sub(unit_pattern, repl_non_std, text) + + text = text.strip() + duration = timedelta(**time_units) if any(time_units.values()) else None + + return (duration, text) + + +def numbers_to_digits_es(utterance: str) -> str: + """ + Replace written numbers in a Spanish text with their digit equivalents. + + Args: + utterance (str): Input string possibly containing written numbers. + + Returns: + str: Text with written numbers replaced by digits. 
# Spoken Basque names for the hours of a 12-hour clock.
HOUR_STRING_EU = {
    1: 'ordubata',
    2: 'ordubiak',
    3: 'hirurak',
    4: 'laurak',
    5: 'bostak',
    6: 'seirak',
    7: 'zazpirak',
    8: 'zortzirak',
    9: 'bederatziak',
    10: 'hamarrak',
    11: 'hamaikak',
    12: 'hamabiak'
}


def nice_time_eu(dt, speech=True, use_24hour=False, use_ampm=False):
    """
    Format a time to a comfortable human format

    For example, generate 'hirurak eta laurden' for speech or '3:15' for
    text display.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        if use_ampm:
            # e.g. "3:01 AM" or "2:22 PM"
            string = dt.strftime("%I:%M %p")
        else:
            # e.g. "3:01" or "2:22"
            string = dt.strftime("%I:%M")
        if string[0] == '0':
            string = string[1:]  # strip leading zeros

    if not speech:
        return string

    # Generate a speakable version of the time
    if use_24hour:
        # In 24-hour speech no part-of-day qualifier is added: the hour
        # number is followed by the minutes, with an explicit "zero" for
        # single-digit minutes (14:04 -> "<14>ak zero <4>").
        speak = pronounce_number_eu(dt.hour) + 'ak'
        if dt.minute < 10:
            speak += " zero " + pronounce_number_eu(dt.minute)
        else:
            speak += " " + pronounce_number_eu(dt.minute)
        return speak

    minute = dt.minute
    hour = dt.hour

    # 12-hour clock: 0 is spoken as 12, 13..23 as 1..11
    spoken_hour = hour % 12 or 12
    if minute > 30:
        # Past the half hour the time is phrased relative to the NEXT hour
        # ("... gutxi"). Wrap 12 -> 1 so 12:45 and 00:45 do not look up a
        # non-existent hour 13 in HOUR_STRING_EU.
        spoken_hour = spoken_hour % 12 + 1

    speak = HOUR_STRING_EU[spoken_hour]

    if minute != 0:
        if minute <= 30:
            if minute == 15:
                speak += " eta laurden"
            elif minute == 30:
                speak += " eta erdi"
            else:
                speak += " eta " + pronounce_number_eu(minute)
        else:
            if minute == 45:
                speak += " laurden gutxi"
            else:
                speak += " " + pronounce_number_eu(60 - minute) + " gutxi"

    # On the exact hour, "puntuan" is only added when no part-of-day
    # qualifier is going to be prepended.
    if minute == 0 and not use_ampm:
        # 3:00
        speak += " puntuan"

    if use_ampm:
        # The part of day is chosen from the actual hour of dt:
        # 06-12 -> "goizeko", 13-19 -> "arratsaldeko", otherwise "gaueko".
        if 6 <= hour < 13:
            speak = "goizeko " + speak
        elif 13 <= hour < 20:
            speak = "arratsaldeko " + speak
        else:
            speak = "gaueko " + speak
    return speak
"a", + "e'", "esta", "este"] + + for word in symbols: + s = s.replace(word, "") + for word in noise_words: + s = s.replace(" " + word + " ", " ") + s = s.lower().replace( + "-", + " ").replace( + "_", + "") + # handle synonyms and equivalents, "tomorrow early = tomorrow morning + synonyms = {"goiza": ["egunsentia", "goiz", "oso goiz"], + "arratsaldea": ["arratsa", "bazkalostea", "arratsalde", "arrats"], + "gaua": ["iluntzea", "berandu", "gau", "gaba"]} + for syn in synonyms: + for word in synonyms[syn]: + s = s.replace(" " + word + " ", " " + syn + " ") + # relevant plurals + wordlist = ["goizak", "arratsaldeak", "gauak", "egunak", "asteak", + "urteak", "minutuak", "segunduak", "hurrengoak", + "datozenak", "orduak", "hilabeteak"] + for _, word in enumerate(wordlist): + s = s.replace(word, word.rstrip('ak')) + # s = s.replace("meses", "mes").replace("anteriores", "anterior") + return s + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if input_str == "": + return None + if anchorDate is None: + anchorDate = datetime.now() + + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + dateNow = anchorDate + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + words = clean_string(input_str).split(" ") + timeQualifiersList = ['goiza', 'arratsaldea', 'gaua'] + time_indicators = ["en", "la", "al", "por", "pasados", + "pasadas", "día", "hora"] + days = ['astelehena', 'asteartea', 'asteazkena', + 'osteguna', 'ostirala', 'larunbata', 'igandea'] + months = ['urtarrila', 'otsaila', 'martxoa', 'apirila', 'maiatza', 'ekaina', + 'uztaila', 'abuztua', 'iraila', 'urria', 'azaroa', + 'abendua'] + monthsShort = ['urt', 'ots', 'mar', 'api', 'mai', 'eka', 'uzt', 'abu', + 'ira', 'urr', 'aza', 'abe'] + 
nexts = ["hurrengo", "datorren", "ondorengo"] + suffix_nexts = ["barru"] + lasts = ["azken", "duela"] + suffix_lasts = ["aurreko"] + nxts = ["ondorengo", "hurrengo", "datorren"] + prevs = ["aurreko", "duela", "previo", "anterior"] + # TODO + froms = ["desde", "en", "para", "después de", "por", "próximo", + "próxima", "de"] + thises = ["hau"] + froms += thises + lists = nxts + prevs + froms + time_indicators + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + if word in timeQualifiersList: + timeQualifier = word + + # parse today, tomorrow, yesterday + elif (word == "gaur" or word == "gaurko") and not fromFlag: + dayOffset = 0 + used += 1 + elif (word == "bihar" or word == "biharko") and not fromFlag: + dayOffset = 1 + used += 1 + elif (word == "atzo" or word == "atzoko") and not fromFlag: + dayOffset -= 1 + used += 1 + # before yesterday + elif (word == "herenegun" or word == "herenegungo") and not fromFlag: + dayOffset -= 2 + used += 1 + # if wordNext == "ayer": + # used += 1 + # elif word == "ante" and wordNext == "ante" and wordNextNext == \ + # "ayer" and not fromFlag: + # dayOffset -= 3 + # used += 3 + # elif word == "ante anteayer" and not fromFlag: + # dayOffset -= 3 + # used += 1 + # day after tomorrow + elif (word == "etzi" or word == "etziko") and not fromFlag: + dayOffset += 2 + used = 1 + elif (word == "etzidamu" or word == "etzidamuko") and not fromFlag: + dayOffset += 3 + used = 1 + # parse 5 days, 10 weeks, last week, next week, week after + elif word == "egun" or word == "eguna" or word == "eguneko": + if wordPrevPrev and wordPrevPrev == "duela": + used += 1 + if wordPrev and 
wordPrev[0].isdigit(): + dayOffset -= int(wordPrev) + start -= 1 + used += 1 + elif (wordPrev and wordPrev[0].isdigit() and + wordNext not in months and + wordNext not in monthsShort): + dayOffset += int(wordPrev) + start -= 1 + used += 2 + elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ + months and wordNextNext not in monthsShort: + dayOffset += int(wordNext) + start -= 1 + used += 2 + + elif word == "aste" or word == "astea" or word == "asteko" and not fromFlag: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + dayOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "hilabete" or word == "hilabetea" or word == "hilabeteko" and not fromFlag: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + monthOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + monthOffset = -7 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "urte" or word == "urtea" or word == "urteko" and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + yearOffset = 1 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + yearOffset = -1 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + yearOffset = 1 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + yearOffset = -1 + start -= 1 + used = 2 + # parse 
Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "hurrengo": + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "aurreko": + dayOffset -= 7 + used += 1 + start -= 1 + if wordNext == "hurrengo": + # dayOffset += 7 + used += 1 + elif wordNext == "aurreko": + # dayOffset -= 7 + used += 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months[m] + if wordPrev and wordPrev[0].isdigit(): + # 13 mayo + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + # mayo 13 + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordPrevPrev and wordPrevPrev[0].isdigit(): + # 13 dia mayo + datestr += " " + wordPrevPrev + + start -= 2 + used += 2 + if wordNext and word[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNextNext and wordNextNext[0].isdigit(): + # mayo dia 13 + datestr += " " + wordNextNext + used += 2 + if wordNextNextNext and wordNextNextNext[0].isdigit(): + datestr += " " + wordNextNextNext + used += 1 + hasYear = True + else: + hasYear = False + + if datestr in months: + datestr = "" + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("gaur") + validFollowups.append("bihar") + validFollowups.append("atzo") + # validFollowups.append("atzoko") + 
validFollowups.append("herenegun") + validFollowups.append("orain") + validFollowups.append("oraintxe") + # validFollowups.append("ante") + + # TODO + if word in froms and wordNext in validFollowups: + + if not (word == "bihar" or word == "herenegun" or word == "atzo"): + used = 1 + fromFlag = True + if wordNext == "bihar": + dayOffset += 1 + elif wordNext == "atzo" or wordNext == "atzoko": + dayOffset -= 1 + elif wordNext == "herenegun": + dayOffset -= 2 + # elif (wordNext == "ante" and wordNext == "ante" and + # wordNextNextNext == "ayer"): + # dayOffset -= 3 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + # if wordNextNext == "feira": + # used += 1 + if tmpOffset < 0: + tmpOffset += 7 + if wordNextNext: + if wordNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNextNextNext: + if wordNextNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + # if wordNextNextNext == "feira": + # used += 1 + if wordNext in months: + used -= 1 + if used > 0: + if start - 1 > 0 and words[start - 1] in lists: + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in lists: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" 
+ # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "eguerdi" or word == "eguerdia" or word == "eguerdian": + hrAbs = 12 + used += 2 + elif word == "gauerdi" or word == "gauerdia" or word == "gauerdian": + hrAbs = 0 + used += 2 + elif word == "goiza": + if not hrAbs: + hrAbs = 8 + used += 1 + elif word == "arratsaldea" or word == "arratsa" or word == "arratsean" or word == "arratsaldean": + if not hrAbs: + hrAbs = 15 + used += 1 + # TODO + # elif word == "media" and wordNext == "tarde": + # if not hrAbs: + # hrAbs = 17 + # used += 2 + elif word == "iluntze" or word == "iluntzea" or word == "iluntzean": + if not hrAbs: + hrAbs = 20 + used += 2 + # TODO + # elif word == "media" and wordNext == "mañana": + # if not hrAbs: + # hrAbs = 10 + # used += 2 + # elif word == "fim" and wordNext == "tarde": + # if not hrAbs: + # hrAbs = 19 + # used += 2 + elif word == "egunsentia" or word == "egunsentian" or word == "egunsenti": + if not hrAbs: + hrAbs = 6 + used += 1 + # elif word == "madrugada": + # if not hrAbs: + # hrAbs = 1 + # used += 2 + elif word == "gaua" or word == "gauean" or word == "gau": + if not hrAbs: + hrAbs = 21 + used += 1 + # parse half an hour, quarter hour + # TODO + elif (word == "hora" and + (wordPrev in time_indicators or wordPrevPrev in + time_indicators)): + if wordPrev == "media": + minOffset = 30 + elif wordPrev == "cuarto": + minOffset = 15 + elif wordPrevPrev == "cuarto": + minOffset = 15 + if idx > 2 and words[idx - 3] in time_indicators: + words[idx - 3] = "" + words[idx - 2] = "" + else: + hrOffset = 1 + if wordPrevPrev in time_indicators: + words[idx - 2] = "" + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + 
elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + elif wordNext == "goiza" or wordNext == "egunsentia" or wordNext == "goizeko" or wordNext == "egunsentiko": + remainder = "am" + used += 1 + elif wordPrev == "arratsaldeko" or wordPrev == "arratsaldea" or wordPrev == "arratsaldean": + remainder = "pm" + used += 1 + elif wordNext == "gaua" or wordNext == "gauean" or wordNext == "gaueko": + if 0 < int(word[0]) < 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + elif wordNext in thises and ( + wordNextNext == "goiza" or wordNextNext == "goizean" or wordNextNext == "goizeko"): + remainder = "am" + used = 2 + elif wordNext in thises and \ + ( + wordNextNext == "arratsaldea" or wordNextNext == "arratsaldean" or wordNextNext == "arratsaldeko"): + remainder = "pm" + used = 2 + elif wordNext in thises and ( + wordNextNext == "gaua" or wordNextNext == "gauean" or wordNextNext == "gaueko"): + remainder = "pm" + used = 2 + else: + if timeQualifier != "": + if strHH <= 12 and \ + (timeQualifier == "goiza" or + timeQualifier == "arratsaldea"): + strHH += 12 + + else: + # try to parse # s without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." 
or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + else: + if (wordNext == "pm" or + wordNext == "p.m." or + wordPrev == "arratsaldeko"): + strHH = strNum + remainder = "pm" + used = 0 + elif (wordNext == "am" or + wordNext == "a.m." or + wordPrev == "goizeko"): + strHH = strNum + remainder = "am" + used = 0 + elif (int(word) > 100 and + ( + # wordPrev == "o" or + # wordPrev == "oh" or + wordPrev == "zero" + )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + if wordNext == "orduak": + used += 1 + elif ( + wordNext == "orduak" and + word[0] != '0' and + ( + int(word) < 100 and + int(word) > 2400 + )): + # ignores military time + # "in 3 hours" + hrOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "minutu": + # "in 10 minutes" + minOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "segundu": + # in 5 seconds + secOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(word) > 100: + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + if wordNext == "ordu": + used += 1 + + elif wordNext == "" or ( + wordNext == "puntuan"): + strHH = word + strMM = 00 + if wordNext == "puntuan": + used += 2 + if wordNextNextNext == "arratsaldea": + remainder = "pm" + used += 1 + elif wordNextNextNext == "goiza": + remainder = "am" + used += 1 + elif wordNextNextNext == "gaua": + if 0 > strHH > 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + + elif wordNext[0].isdigit(): + strHH = word + strMM = wordNext + used += 1 + if wordNextNext == "orduak": + used += 1 + else: + isTime = False + + strHH = int(strHH) if strHH else 0 + strMM = int(strMM) if strMM else 0 + strHH = strHH + 12 if (remainder == "pm" and + 0 < strHH < 12) else strHH + strHH = strHH - 12 if (remainder == "am" and + 0 < strHH >= 12) else strHH + if strHH > 24 or strMM > 59: + isTime = False + used = 0 + 
if isTime: + hrAbs = strHH * 1 + minAbs = strMM * 1 + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + words[idx + i] = "" + + if wordPrev == "puntuan": + words[words.index(wordPrev)] = "" + + if idx > 0 and wordPrev in time_indicators: + words[idx - 1] = "" + if idx > 1 and wordPrevPrev in time_indicators: + words[idx - 2] = "" + + idx += used - 1 + found = True + + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow + extractedDate = extractedDate.replace(microsecond=0, + second=0, + minute=0, + hour=0) + if datestr != "": + en_months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', + 'aug', + 'sept', 'oct', 'nov', 'dec'] + for idx, en_month in enumerate(en_months): + datestr = datestr.replace(months[idx], en_month) + for idx, en_month in enumerate(en_monthsShort): + datestr = datestr.replace(monthsShort[idx], en_month) + + temp = datetime.strptime(datestr, "%B %d") + temp = temp.replace(tzinfo=None) + if not hasYear: + temp = temp.replace(year=extractedDate.year, tzinfo=extractedDate.tzinfo) + if extractedDate < temp: + extractedDate = extractedDate.replace(year=int(currentYear), + month=int( + temp.strftime( + "%m")), + day=int(temp.strftime( + "%d"))) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate 
def extract_duration_fa(text):
    """
    Extract a duration from a Persian phrase.

    The text is split with _parse_sentence; every number token that is
    followed by a unit found in _time_units or _date_units is added to the
    total. All other tokens are collected, in order, as the remainder.
    The conjunction "و" is skipped.

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The duration is
                    timedelta(0) when no duration is found.
    """
    remainder = []
    total = timedelta(0)
    # Most recent number token not yet attached to a unit.
    # Presumably a (value, [source words]) tuple - verify against
    # _parse_sentence; only [0] and [1] are used here.
    pending = None

    def flush_pending():
        # A number that never met a unit is ordinary text: give its words
        # back to the remainder instead of dropping them.
        if pending:
            remainder.extend(pending[1])

    for token in _parse_sentence(text):
        if token == "و":
            continue

        if isinstance(token, tuple):
            flush_pending()
            pending = token
            continue

        if pending is not None:
            unit = _time_units.get(token) or _date_units.get(token)
            if unit is not None:
                total += unit * pending[0]
                pending = None
                continue

        # Either an ordinary word, or a unit word with no number in front
        # of it (which previously crashed on `None[0]`); both are kept as
        # remainder text.
        flush_pending()
        remainder.append(token)
        pending = None

    # A number at the very end of the sentence was never consumed.
    flush_pending()
    return (total, " ".join(remainder))
+ """ + if text == "": + return None + text = text.lower().replace('‌', ' ').replace('.', '').replace('،', '') \ + .replace('?', '').replace("پس فردا", "پسفردا") \ + .replace('یک شنبه', 'یکشنبه') \ + .replace('دو شنبه', 'دوشنبه') \ + .replace('سه شنبه', 'سهشنبه') \ + .replace('چهار شنبه', 'چهارشنبه') \ + .replace('پنج شنبه', 'پنجشنبه') \ + .replace('بعد از ظهر', 'بعدازظهر') + + if not anchorDate: + anchorDate = now_local() + today = anchorDate.replace(hour=0, minute=0, second=0, microsecond=0) + today_weekday = int(anchorDate.strftime("%w")) + weekday_names = [ + 'دوشنبه', + 'سهشنبه', + 'چهارشنبه', + 'پنجشنبه', + 'جمعه', + 'شنبه', + 'یکشنبه', + ] + daysDict = { + 'پریروز': today + timedelta(days=-2), + 'دیروز': today + timedelta(days=-1), + 'امروز': today, + 'فردا': today + timedelta(days=1), + 'پسفردا': today + timedelta(days=2), + } + timesDict = { + 'صبح': timedelta(hours=8), + 'بعدازظهر': timedelta(hours=15), + } + exactDict = { + 'الان': anchorDate, + } + nextWords = ["بعد", "دیگه"] + prevWords = ["پیش", "قبل"] + ar = _parse_sentence(text) + mode = 'none' + number_seen = None + delta_seen = timedelta(0) + remainder = [] + result = None + for x in ar: + handled = 1 + if mode == 'finished': + remainder.append(x) + elif x == 'و' and mode[:5] == 'delta': + pass + elif type(x) == tuple: + number_seen = x + elif x in weekday_names: + dayOffset = (weekday_names.index(x) + 1) - today_weekday + if dayOffset < 0: + dayOffset += 7 + result = today + timedelta(days=dayOffset) + mode = 'time' + elif x in exactDict: + result = exactDict[x] + mode = 'finished' + elif x in daysDict: + result = daysDict[x] + mode = 'time' + elif x in timesDict and mode == 'time': + result += timesDict[x] + mode = 'finish' + elif x in _date_units: + k = 1 + if (number_seen): + k = number_seen[0] + number_seen = None + delta_seen += _date_units[x] * k + if mode != 'delta_time': + mode = 'delta_date' + elif x in _time_units: + k = 1 + if (number_seen): + k = number_seen[0] + number_seen = None + 
def nice_time_fa(dt, speech=True, use_24hour=False, use_ampm=False):
    """
    Render a time as a display string or as a speakable Persian phrase.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): True (default) for a spoken phrase, False for a
                       digit string such as '5:30'
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    elif use_ampm:
        # e.g. "3:01 AM" or "2:22 PM"
        string = dt.strftime("%I:%M %p")
    else:
        # e.g. "3:01" or "2:22"
        string = dt.strftime("%I:%M")
    if not use_24hour and string[0] == '0':
        string = string[1:]  # strip leading zeros

    if not speech:
        return string

    minute = dt.minute

    if use_24hour:
        # hour, then "and <minutes> minutes" unless exactly on the hour
        spoken = pronounce_number_fa(dt.hour)
        if minute != 0:
            spoken += " و " + pronounce_number_fa(minute) + ' دقیقه'
        return spoken

    # midnight and noon have their own names
    if minute == 0:
        if dt.hour == 0:
            return "نیمه شب"
        if dt.hour == 12:
            return "ظهر"

    hour12 = dt.hour % 12 or 12  # 12 hour clock and 0 is spoken as 12

    if minute == 45:
        # "a quarter to <next hour>"
        upcoming = (dt.hour + 1) % 12 or 12
        spoken = "یه ربع به " + pronounce_number_fa(upcoming)
    elif minute == 15:
        spoken = pronounce_number_fa(hour12) + " و ربع"
    elif minute == 30:
        spoken = pronounce_number_fa(hour12) + " و نیم"
    elif minute == 0:
        spoken = pronounce_number_fa(hour12)
        if not use_ampm:
            return spoken
    else:
        spoken = pronounce_number_fa(hour12)
        spoken += " و " + pronounce_number_fa(minute) + ' دقیقه'

    if use_ampm:
        spoken += " بعد از ظهر" if dt.hour > 11 else " قبل از ظهر"

    return spoken
speech or '5:30' for + text display. + + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. "3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + speak = "" + if use_24hour: + + # "13 heures trente" + if dt.hour == 0: + speak += "minuit" + elif dt.hour == 12: + speak += "midi" + elif dt.hour == 1: + speak += "une heure" + else: + speak += pronounce_number_fr(dt.hour) + " heures" + + if dt.minute != 0: + speak += " " + pronounce_number_fr(dt.minute) + + else: + # Prepare for "trois heures moins le quart" + if dt.minute == 35: + minute = -25 + hour = dt.hour + 1 + elif dt.minute == 40: + minute = -20 + hour = dt.hour + 1 + elif dt.minute == 45: + minute = -15 + hour = dt.hour + 1 + elif dt.minute == 50: + minute = -10 + hour = dt.hour + 1 + elif dt.minute == 55: + minute = -5 + hour = dt.hour + 1 + else: + minute = dt.minute + hour = dt.hour + + if hour == 0: + speak += "minuit" + elif hour == 12: + speak += "midi" + elif hour == 1 or hour == 13: + speak += "une heure" + elif hour < 13: + speak = pronounce_number_fr(hour) + " heures" + else: + speak = pronounce_number_fr(hour - 12) + " heures" + + if minute != 0: + if minute == 15: + speak += " et quart" + elif minute == 30: + speak += " et demi" + elif minute == -15: + speak += " moins le quart" + else: + speak += " " + pronounce_number_fr(minute) + + if use_ampm: + if hour > 17: + speak += " du soir" + elif hour > 12: + speak 
+= " de l'après-midi" + elif hour > 0 and hour < 12: + speak += " du matin" + + return speak + + +def extract_datetime_fr(text, anchorDate=None, default_time=None): + def clean_string(s): + """ + cleans the input string of unneeded punctuation and capitalization + among other things. + """ + s = normalize_fr(s, True) + wordList = s.split() + for idx, word in enumerate(wordList): + # remove comma and dot if it's not a number + if word[-1] in [",", "."]: + word = word[:-1] + wordList[idx] = word + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or dayOffset or + (isTime and (hrAbs or minAbs)) or + hrOffset != 0 or minOffset != 0 or secOffset != 0 + ) + + if text == "": + return None + + anchorDate = anchorDate or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + dateNow = anchorDate + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersList = ["matin", "après-midi", "soir", "nuit"] + words_in = ["dans", "après"] + markers = ["à", "dès", "autour", "vers", "environs", "ce", + "cette"] + words_in + days = ["lundi", "mardi", "mercredi", + "jeudi", "vendredi", "samedi", "dimanche"] + months = ["janvier", "février", "mars", "avril", "mai", "juin", + "juillet", "août", "septembre", "octobre", "novembre", + "décembre"] + monthsShort = ["jan", "fév", "mar", "avr", "mai", "juin", "juil", "aoû", + "sept", "oct", "nov", "déc"] + # needed for format functions + months_en = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + + words = clean_string(text) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrevPrev = words[idx - 3] if idx > 2 else "" + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" 
+ wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + if word in timeQualifiersList: + timeQualifier = word + used = 1 + if wordPrev in ["ce", "cet", "cette"]: + used = 2 + start -= 1 + # parse aujourd'hui, demain, après-demain + elif word == "aujourd'hui" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "demain" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "après-demain" and not fromFlag: + dayOffset = 2 + used += 1 + # parse 5 jours, 10 semaines, semaine dernière, semaine prochaine + elif word in ["jour", "jours"]: + if wordPrev.isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + # "3e jour" + elif _get_ordinal_fr(wordPrev) is not None: + dayOffset += _get_ordinal_fr(wordPrev) - 1 + start -= 1 + used = 2 + elif word in ["semaine", "semaines"] and not fromFlag: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + elif wordNext in ["prochaine", "suivante"]: + dayOffset = 7 + used = 2 + elif wordNext in ["dernière", "précédente"]: + dayOffset = -7 + used = 2 + # parse 10 mois, mois prochain, mois dernier + elif word == "mois" and not fromFlag: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordNext in ["prochain", "suivant"]: + monthOffset = 1 + used = 2 + elif wordNext in ["dernier", "précédent"]: + monthOffset = -1 + used = 2 + # parse 5 ans, an prochain, année dernière + elif word in ["an", "ans", "année", "années"] and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordNext in ["prochain", "prochaine", "suivant", "suivante"]: + yearOffset = 1 + used = 2 + elif wordNext in ["dernier", "dernière", "précédent", + "précédente"]: + yearOffset = -1 + used = 2 + # parse lundi, mardi etc., and lundi prochain, mardi dernier, etc. 
+ elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordNext in ["prochain", "suivant"]: + dayOffset += 7 + used += 1 + elif wordNext in ["dernier", "précédent"]: + dayOffset -= 7 + used += 1 + # parse 15 juillet, 15 juil + elif word in months or word in monthsShort and not fromFlag: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months_en[m] + if wordPrev and (wordPrev[0].isdigit()): + datestr += " " + wordPrev + start -= 1 + used += 1 + else: + datestr += " 1" + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + # parse 5 jours après demain, 10 semaines après jeudi prochain, + # 2 mois après juillet + validFollowups = days + months + monthsShort + validFollowups.append("aujourd'hui") + validFollowups.append("demain") + validFollowups.append("prochain") + validFollowups.append("prochaine") + validFollowups.append("suivant") + validFollowups.append("suivante") + validFollowups.append("dernier") + validFollowups.append("dernière") + validFollowups.append("précédent") + validFollowups.append("précédente") + validFollowups.append("maintenant") + if word in ["après", "depuis"] and wordNext in validFollowups: + used = 2 + fromFlag = True + if wordNext == "demain": + dayOffset += 1 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if wordNextNext == "prochain": + tmpOffset += 7 + used += 1 + elif wordNextNext == "dernier": + tmpOffset -= 7 + used += 1 + elif tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and words[start - 1] in ["ce", "cette"]: + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + 
hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + ampm = "" + isTime = False + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + used = 0 + start = idx + + # parse midi et quart, minuit et demi, midi 10, minuit moins 20 + if word in ["midi", "minuit"]: + isTime = True + if word == "midi": + hrAbs = 12 + used += 1 + elif word == "minuit": + hrAbs = 0 + used += 1 + if wordNext.isdigit(): + minAbs = int(wordNext) + used += 1 + elif wordNext == "et": + if wordNextNext == "quart": + minAbs = 15 + used += 2 + elif wordNextNext == "demi": + minAbs = 30 + used += 2 + elif wordNext == "moins": + if wordNextNext.isdigit(): + minAbs = 60 - int(wordNextNext) + if not hrAbs: + hrAbs = 23 + else: + hrAbs -= 1 + used += 2 + if wordNextNext == "quart": + minAbs = 45 + if not hrAbs: + hrAbs = 23 + else: + hrAbs -= 1 + used += 2 + # parse une demi-heure, un quart d'heure + elif word == "demi-heure" or word == "heure" and \ + (wordPrevPrev in markers or wordPrevPrevPrev in markers): + used = 1 + isTime = True + if word == "demi-heure": + minOffset = 30 + elif wordPrev == "quart": + minOffset = 15 + used += 1 + start -= 1 + elif wordPrev == "quarts" and wordPrevPrev.isdigit(): + minOffset = int(wordPrevPrev) * 15 + used += 1 + start -= 1 + if wordPrev.isdigit() or wordPrevPrev.isdigit(): + start -= 1 + used += 1 + # parse 5:00 du matin, 12:00, etc + elif word[0].isdigit() and _get_ordinal_fr(word) is None: + isTime = True + if ":" in word or "h" in word or "min" in word: + # parse hours on short format + # "3:00 du matin", "4h14", "3h15min" + strHH = "" + strMM = "" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + used = 1 + elif word[i] in [":", 
"h", "m"]: + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + used = 1 + else: + stage = 2 + if word[i:i + 3] == "min": + i += 1 + elif stage == 2: + break + if wordPrev in words_in: + hrOffset = int(strHH) if strHH else 0 + minOffset = int(strMM) if strMM else 0 + else: + hrAbs = int(strHH) if strHH else 0 + minAbs = int(strMM) if strMM else 0 + else: + # try to parse time without colons + # 5 hours, 10 minutes etc. + length = len(word) + ampm = "" + if ( + word.isdigit() and + wordNext in ["heures", "heure"] and word != "0" and + ( + int(word) < 100 or + int(word) > 2400 + )): + # "dans 3 heures", "à 3 heures" + if wordPrev in words_in: + hrOffset = int(word) + else: + hrAbs = int(word) + used = 2 + idxHr = idx + 2 + # "dans 1 heure 40", "à 1 heure 40" + if idxHr < len(words): + # "3 heures 45" + if words[idxHr].isdigit(): + if wordPrev in words_in: + minOffset = int(words[idxHr]) + else: + minAbs = int(words[idxHr]) + used += 1 + idxHr += 1 + # "3 heures et quart", "4 heures et demi" + elif words[idxHr] == "et" and idxHr + 1 < len(words): + if words[idxHr + 1] == "quart": + if wordPrev in words_in: + minOffset = 15 + else: + minAbs = 15 + used += 2 + idxHr += 2 + elif words[idxHr + 1] == "demi": + if wordPrev in words_in: + minOffset = 30 + else: + minAbs = 30 + used += 2 + idxHr += 2 + # "5 heures moins 20", "6 heures moins le quart" + elif words[idxHr] == "moins" and \ + idxHr + 1 < len(words): + if words[idxHr + 1].isdigit(): + if wordPrev in words_in: + hrOffset -= 1 + minOffset = 60 - int(words[idxHr + 1]) + else: + hrAbs = hrAbs - 1 + minAbs = 60 - int(words[idxHr + 1]) + used += 2 + idxHr += 2 + elif words[idxHr + 1] == "quart": + if wordPrev in words_in: + hrOffset -= 1 + minOffset = 45 + else: + hrAbs = hrAbs - 1 + minAbs = 45 + used += 2 + idxHr += 2 + # remove word minutes if present + if idxHr < len(words) and \ + words[idxHr] in ["minutes", "minute"]: + used += 1 + idxHr += 1 + elif wordNext == 
"minutes": + # "dans 10 minutes" + if wordPrev in words_in: + minOffset = int(word) + else: + minAbs = int(word) + used = 2 + elif wordNext == "secondes": + # "dans 5 secondes" + secOffset = int(word) + used = 2 + elif int(word) > 100: + # format militaire + hrAbs = int(word) / 100 + minAbs = int(word) - hrAbs * 100 + used = 1 + if wordNext == "heures": + used += 1 + + # handle am/pm + if timeQualifier: + if timeQualifier == "matin": + ampm = "am" + elif timeQualifier == "après-midi": + ampm = "pm" + elif timeQualifier == "soir": + ampm = "pm" + elif timeQualifier == "nuit": + if (hrAbs or 0) > 8: + ampm = "pm" + else: + ampm = "am" + hrAbs = ((hrAbs or 0) + 12 if ampm == "pm" and (hrAbs or 0) < 12 + else hrAbs) + hrAbs = ((hrAbs or 0) - 12 if ampm == "am" and (hrAbs or 0) >= 12 + else hrAbs) + if (hrAbs or 0) > 24 or ((minAbs or 0) > 59): + isTime = False + used = 0 + elif wordPrev in words_in: + isTime = False + else: + isTime = True + + elif not hrAbs and timeQualifier: + if timeQualifier == "matin": + hrAbs = 8 + elif timeQualifier == "après-midi": + hrAbs = 15 + elif timeQualifier == "soir": + hrAbs = 19 + elif timeQualifier == "nuit": + hrAbs = 2 + isTime = True + + if used > 0: + # removed parsed words from the sentence + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + + idx += used - 1 + found = True + + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + extractedDate = dateNow + extractedDate = extractedDate.replace(microsecond=0, + second=0, + minute=0, + hour=0) + if datestr != "": + if not hasYear: + temp = datetime.strptime(datestr, "%B %d") + if extractedDate.tzinfo: + temp = temp.replace(tzinfo=extractedDate.tzinfo) + temp = temp.replace(year=extractedDate.year) + if extractedDate < temp: + extractedDate = extractedDate.replace(year=int(currentYear), + month=int( + temp.strftime( 
+ "%m")), + day=int(temp.strftime( + "%d"))) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + else: + temp = datetime.strptime(datestr, "%B %d %Y") + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + + if hrAbs is None and minAbs is None and default_time: + hrAbs = default_time.hour + minAbs = default_time.minute + if hrAbs != -1 and minAbs != -1: + extractedDate = extractedDate + relativedelta(hours=hrAbs or 0, + minutes=minAbs or 0) + if (hrAbs or minAbs) and datestr == "": + if not daySpecified and dateNow > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + for idx, word in enumerate(words): + if words[idx] == "et" and words[idx - 1] == "" and \ + words[idx + 1] == "": + words[idx] = "" + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + return [extractedDate, resultStr] + + +def normalize_fr(text, remove_articles=True): + """ French string normalization """ + text = text.lower() + words = text.split() # this also removed extra spaces + normalized = "" + i = 0 + while i < len(words): + # remove articles + if remove_articles and words[i] in _ARTICLES_FR: + i += 1 + continue + if remove_articles and words[i][:2] in ["l'", "d'"]: + words[i] = words[i][2:] + # remove useless punctuation signs + if words[i] in ["?", "!", 
";", "…"]: + i += 1 + continue + # Normalize ordinal numbers + if i > 0 and words[i - 1] in _ARTICLES_FR: + result = _number_ordinal_fr(words, i) + if result is not None: + val, i = result + normalized += " " + str(val) + continue + # Convert numbers into digits + result = _number_parse_fr(words, i) + if result is not None: + val, i = result + normalized += " " + str(val) + continue + + normalized += " " + words[i] + i += 1 + + return normalized[1:] # strip the initial space diff --git a/ovos_date_parser/dates_hu.py b/ovos_date_parser/dates_hu.py new file mode 100644 index 0000000..58ad751 --- /dev/null +++ b/ovos_date_parser/dates_hu.py @@ -0,0 +1,80 @@ +from datetime import datetime + +from ovos_number_parser.numbers_hu import pronounce_number_hu, _NUM_STRING_HU + + +def nice_time_hu(dt, speech=True, use_24hour=False, use_ampm=False): + """ + Format a time to a comfortable human format + + For example, generate 'five thirty' for speech or '5:30' for + text display. + + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. 
"3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + speak = "" + if use_24hour: + speak += pronounce_number_hu(dt.hour) + speak = speak.replace(_NUM_STRING_HU[2], 'két') + speak += " óra" + if not dt.minute == 0: # zero minutes are not pronounced + speak += " " + pronounce_number_hu(dt.minute) + + return speak # ampm is ignored when use_24hour is true + else: + if dt.hour == 0 and dt.minute == 0: + return "éjfél" + if dt.hour == 12 and dt.minute == 0: + return "dél" + # TODO: "half past 3", "a quarter of 4" and other idiomatic times + + if dt.hour == 0: + speak += pronounce_number_hu(12) + elif dt.hour < 13: + speak = pronounce_number_hu(dt.hour) + else: + speak = pronounce_number_hu(dt.hour - 12) + + speak = speak.replace(_NUM_STRING_HU[2], 'két') + speak += " óra" + + if not dt.minute == 0: + speak += " " + pronounce_number_hu(dt.minute) + + if use_ampm: + if dt.hour > 11: + if dt.hour < 18: + speak = "délután " + speak # 12:01 - 17:59 + elif dt.hour < 22: + speak = "este " + speak # 18:00 - 21:59 este/evening + else: + speak = "éjjel " + speak # 22:00 - 23:59 éjjel/at night + elif dt.hour < 3: + speak = "éjjel " + speak # 00:01 - 02:59 éjjel/at night + else: + speak = "reggel " + speak # 03:00 - 11:59 reggel/in t. 
morning + + return speak diff --git a/ovos_date_parser/dates_it.py b/ovos_date_parser/dates_it.py new file mode 100644 index 0000000..141333d --- /dev/null +++ b/ovos_date_parser/dates_it.py @@ -0,0 +1,793 @@ +from datetime import datetime + +from dateutil.relativedelta import relativedelta +from ovos_number_parser.numbers_it import extract_number_it, pronounce_number_it +from ovos_utils.time import now_local + + +def nice_time_it(dt, speech=True, use_24hour=False, use_ampm=False): + """ + Format a time to a comfortable human format + adapted to italian fron en version + + For example, generate 'cinque e trenta' for speech or '5:30' for + text display. + + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. 
"3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + if use_24hour: + speak = "" + # Either "zero 8 zerozero" o "13 zerozero" + if string[0:2] == '00': + speak += "zerozero" + elif string[0] == '0': + speak += pronounce_number_it(int(string[0])) + " " + if int(string[1]) == 1: + speak = "una" + else: + speak += pronounce_number_it(int(string[1])) + else: + speak = pronounce_number_it(int(string[0:2])) + + # in italian "13 e 25" + speak += " e " + + if string[3:5] == '00': + speak += "zerozero" + else: + if string[3] == '0': + speak += pronounce_number_it(0) + " " + speak += pronounce_number_it(int(string[4])) + else: + speak += pronounce_number_it(int(string[3:5])) + return speak + else: + if dt.hour == 0 and dt.minute == 0: + return "mezzanotte" + if dt.hour == 12 and dt.minute == 0: + return "mezzogiorno" + # TODO: "10 e un quarto", "4 e tre quarti" and ot her idiomatic times + + if dt.hour == 0: + speak = "mezzanotte" + elif dt.hour == 1 or dt.hour == 13: + speak = "una" + elif dt.hour > 13: # era minore + speak = pronounce_number_it(dt.hour - 12) + else: + speak = pronounce_number_it(dt.hour) + + speak += " e" + if dt.minute == 0: + speak = speak[:-2] + if not use_ampm: + speak += " in punto" + elif dt.minute == 15: + speak += " un quarto" + elif dt.minute == 45: + speak += " tre quarti" + else: + if dt.minute < 10: + speak += " zero" + speak += " " + pronounce_number_it(dt.minute) + + if use_ampm: + + if dt.hour < 4: + speak.strip() + elif dt.hour > 20: + speak += " della notte" + elif dt.hour > 17: + speak += " della sera" + elif dt.hour > 12: + speak += " del pomeriggio" + else: + speak += " della mattina" + + return speak + + +def extract_datetime_it(text, anchorDate=None, default_time=None): + def clean_string(s): + """ + cleans the input string of unneeded punctuation and capitalization + among other things. 
+ Normalize italian plurals + """ + symbols = ['.', ',', ';', '?', '!', 'º', 'ª', '°', 'l\''] + + for word in symbols: + s = s.replace(word, '') + + s = s.lower().replace('á', 'a').replace('à', 'a').replace('è', "e'") \ + .replace('é', "e'").replace('ì', 'i').replace('ù', 'u') \ + .replace('ò', 'o').replace('-', ' ').replace('_', '') + + # normalizza plurali per semplificare analisi + s = s.replace('secondi', 'secondo').replace('minuti', 'minuto') \ + .replace('ore', 'ora').replace('giorni', 'giorno') \ + .replace('settimane', 'settimana').replace('mesi', 'mese') \ + .replace('anni', 'anno').replace('mattino', 'mattina') \ + .replace('prossima', 'prossimo').replace('questa', 'questo') \ + .replace('quarti', 'quarto').replace('in punto', 'in_punto') \ + .replace('decennio', 'decenni').replace('secoli', 'secolo') \ + .replace('millennio', 'millenni').replace(' un ', ' uno ') \ + .replace('scorsa', 'scorso').replace('passata', 'passato') \ + .replace('uno paio', 'due') + + noise_words = ['dello', 'la', 'del', 'al', 'il', 'di', 'tra', 'lo', + 'le', 'alle', 'alla', 'dai', 'delle', 'della', + 'a', 'e\'', 'era', 'questa', 'questo', 'e', 'nel', + 'nello', 'dallo', ' '] + + word_list = s.split() + word_list = [x for x in word_list if x not in noise_words] + # normalizza alcuni formati orari + for idx in range(0, len(word_list) - 1): + if word_list[idx][0].isdigit() and word_list[idx + 1][0].isdigit(): + num0 = int(word_list[idx]) + num1 = int(word_list[idx + 1]) + if 0 <= num0 <= 23 and 10 <= num1 <= 59: + word_list[idx] = str(num0) + ':' + str(num1) + word_list[idx + 1] = '' + + word_list = [x for x in word_list if x] + + return word_list + + def date_found(): + return found or \ + (datestr != '' or time_str != '' or year_offset != 0 or + month_offset != 0 or day_offset is True or hr_offset != 0 or + hr_abs or min_offset != 0 or min_abs or sec_offset != 0) + + if text == '': + return None + anchorDate = anchorDate or now_local() + found = False + day_specified = False + 
day_offset = False + month_offset = 0 + year_offset = 0 + today = anchorDate.strftime('%w') + current_year = anchorDate.strftime('%Y') + from_flag = False + datestr = '' + has_year = False + time_qualifier = '' + time_qualifiers_am = ['mattina', 'stamani', 'stamane'] + time_qualifiers_pm = ['pomeriggio', 'sera', 'stasera', 'stanotte'] + time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm) + markers = ['alle', 'in', 'questo', 'per', 'di', 'tra', 'fra', 'entro'] + days = ['lunedi', 'martedi', 'mercoledi', + 'giovedi', 'venerdi', 'sabato', 'domenica'] + months = ['gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno', + 'luglio', 'agosto', 'settembre', 'ottobre', 'novembre', + 'dicembre'] + months_short = ['gen', 'feb', 'mar', 'apr', 'mag', 'giu', 'lug', 'ago', + 'set', 'ott', 'nov', 'dic'] + year_multiples = ['decenni', 'secolo', 'millenni'] # decennio <- decenni + time_multiples = ['ora', 'minuto', 'secondo'] + day_multiples = ['settimana', 'mese', 'anno'] + noise_words_2 = ['tra', 'di', 'per', 'fra', 'un ', 'uno', 'lo', 'del', + 'l', 'in_punto', ' ', 'nella', 'dell'] + + words = clean_string(text) + + for idx, word in enumerate(words): + if word == '': + continue + word_prev_prev = words[idx - 2] if idx > 1 else '' + word_prev = words[idx - 1] if idx > 0 else '' + word_next = words[idx + 1] if idx + 1 < len(words) else '' + word_next_next = words[idx + 2] if idx + 2 < len(words) else '' + start = idx + used = 0 + # save timequalifier for later + if word == 'adesso' and not datestr: + # word == 'ora' va in conflitto con 'tra un ora' + words = [x for x in words if x != 'adesso'] + words = [x for x in words if x] + result_str = ' '.join(words) + extracted_date = anchorDate.replace(microsecond=0) + return [extracted_date, result_str] + + # un paio di o tra tre settimane --> secoli + elif extract_number_it(word) and (word_next in year_multiples or + word_next in day_multiples): + multiplier = int(extract_number_it(word)) + used += 2 + if word_next 
== 'decenni': + year_offset = multiplier * 10 + elif word_next == 'secolo': + year_offset = multiplier * 100 + elif word_next == 'millenni': + year_offset = multiplier * 1000 + elif word_next == 'anno': + year_offset = multiplier + elif word_next == 'mese': + month_offset = multiplier + elif word_next == 'settimana': + day_offset = multiplier * 7 + elif word in time_qualifiers_list: + time_qualifier = word + # parse today, tomorrow, day after tomorrow + elif word == 'oggi' and not from_flag: + day_offset = 0 + used += 1 + elif word == 'domani' and not from_flag: + day_offset = 1 + used += 1 + elif word == 'ieri' and not from_flag: + day_offset -= 1 + used += 1 + elif word == 'dopodomani' and not from_flag: # after tomorrow + day_offset += 2 + used += 1 + elif word == 'dopo' and word_next == 'domani' and not from_flag: + day_offset += 1 + used += 2 + elif word == 'giorno': + if word_prev[0].isdigit(): + day_offset += int(word_prev) + start -= 1 + used = 2 + if word_next == 'dopo' and word_next_next == 'domani': + day_offset += 1 + used += 2 + elif word == 'settimana' and not from_flag: + if word_prev == 'prossimo': + day_offset = 7 + start -= 1 + used = 2 + elif word_prev == 'passato' or word_prev == 'scorso': + day_offset = -7 + start -= 1 + used = 2 + elif word_next == 'prossimo': + day_offset = 7 + used += 2 + elif word_next == 'passato' or word_next == 'scorso': + day_offset = -7 + used += 2 + # parse next month, last month + elif word == 'mese' and not from_flag: + if word_prev == 'prossimo': + month_offset = 1 + start -= 1 + used = 2 + elif word_prev == 'passato' or word_prev == 'scorso': + month_offset = -1 + start -= 1 + used = 2 + elif word_next == 'prossimo': + month_offset = 1 + used += 2 + elif word_next == 'passato' or word_next == 'scorso': + month_offset = -1 + used += 2 + # parse next year, last year + elif word == 'anno' and not from_flag: + if word_prev == 'prossimo': # prossimo anno + year_offset = 1 + start -= 1 + used = 2 + elif word_next == 
'prossimo': # anno prossimo + year_offset = 1 + used = 2 + elif word_prev == 'passato' or word_prev == 'scorso': + year_offset = -1 + start -= 1 + used = 2 + elif word_next == 'passato' or word_next == 'scorso': + year_offset = -1 + used = 2 + elif word == 'decenni' and not from_flag: + if word_prev == 'prossimo': # prossimo mese + year_offset = 10 + start -= 1 + used = 2 + elif word_next == 'prossimo': # mese prossimo + year_offset = 10 + used = 2 + elif word_prev == 'passato' or word_prev == 'scorso': + year_offset = -10 + start -= 1 + used = 2 + elif word_next == 'passato' or word_next == 'scorso': + year_offset = -10 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days and not from_flag: + ddd = days.index(word) + day_offset = (ddd + 1) - int(today) + used = 1 + if day_offset < 0: + day_offset += 7 + if word_prev == 'prossimo': + day_offset += 7 + start -= 1 + used += 1 + elif word_prev == 'passato' or word_prev == 'scorso': + day_offset -= 7 + start -= 1 + used += 1 + if word_next == 'prossimo': + day_offset += 7 + used += 1 + elif word_next == 'passato' or word_next == 'scorso': + day_offset -= 7 + used += 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in months_short and not from_flag: + try: + mmm = months.index(word) + except ValueError: + mmm = months_short.index(word) + used += 1 + datestr = months[mmm] + if word_prev and extract_number_it(word_prev): + datestr += ' ' + str(int(extract_number_it(word_prev))) + start -= 1 + used += 1 + if word_next and extract_number_it(word_next): + datestr += ' ' + str(int(extract_number_it(word_next))) + used += 1 + has_year = True + else: + has_year = False + elif word_next and word_next[0].isdigit(): + datestr += ' ' + word_next + used += 1 + if word_next_next and word_next_next[0].isdigit(): + datestr += ' ' + word_next_next + used += 1 + has_year = True + else: + has_year = False + # parse 5 days from tomorrow, 10 weeks from 
next thursday, + # 2 months from July + validFollowups = days + months + months_short + validFollowups.append('oggi') + validFollowups.append('domani') + validFollowups.append('prossimo') + validFollowups.append('passato') + validFollowups.append('adesso') + + if (word == 'da' or word == 'dopo') and word_next in validFollowups: + used = 0 + from_flag = True + if word_next == 'domani': + day_offset += 1 + used += 2 + elif word_next == 'oggi' or word_next == 'adesso': + used += 2 + elif word_next in days: + ddd = days.index(word_next) + tmp_offset = (ddd + 1) - int(today) + used += 2 + if tmp_offset < 0: + tmp_offset += 7 + if word_next_next == 'prossimo': + tmp_offset += 7 + used += 1 + elif word_next_next == 'passato' or word_next_next == 'scorso': + tmp_offset = (ddd + 1) - int(today) + used += 1 + day_offset += tmp_offset + elif word_next_next and word_next_next in days: + ddd = days.index(word_next_next) + tmp_offset = (ddd + 1) - int(today) + if word_next == 'prossimo': + tmp_offset += 7 + # elif word_next == 'passato' or word_next == 'scorso': + # tmp_offset -= 7 + day_offset += tmp_offset + used += 3 + + if used > 0: + if start - 1 > 0 and words[start - 1] == 'questo': + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = '' + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = '' + found = True + day_specified = True + + # parse time + time_str = '' + hr_offset = 0 + min_offset = 0 + sec_offset = 0 + hr_abs = None + min_abs = None + military = False + + for idx, word in enumerate(words): + if word == '': + continue + word_prev_prev = words[idx - 2] if idx > 1 else '' + word_prev = words[idx - 1] if idx > 0 else '' + word_next = words[idx + 1] if idx + 1 < len(words) else '' + word_next_next = words[idx + 2] if idx + 2 < len(words) else '' + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == 'mezzogiorno': + hr_abs = 12 + used += 1 + elif word == 'mezzanotte': + hr_abs = 24 + used += 1 + if 
word == 'mezzo' and word_next == 'giorno': + hr_abs = 12 + used += 2 + elif word == 'mezza' and word_next == 'notte': + hr_abs = 24 + used += 2 + elif word == 'mattina': + if not hr_abs: + hr_abs = 8 + used += 1 + if word_next and word_next[0].isdigit(): # mattina alle 5 + hr_abs = int(word_next) + used += 1 + elif word == 'pomeriggio': + if not hr_abs: + hr_abs = 15 + used += 1 + if word_next and word_next[0].isdigit(): # pomeriggio alle 5 + hr_abs = int(word_next) + used += 1 + if (hr_abs or 0) < 12: + hr_abs = (hr_abs or 0) + 12 + elif word == 'sera': + if not hr_abs: + hr_abs = 19 + used += 1 + if word_next and word_next[0].isdigit() \ + and ':' not in word_next: + hr_abs = int(word_next) + used += 1 + if (hr_abs or 0) < 12: + hr_abs = (hr_abs or 0) + 12 + # da verificare più a fondo + elif word == 'presto': + hr_abs -= 1 + used += 1 + elif word == 'tardi': + hr_abs += 1 + used += 1 + # un paio di minuti tra cinque minuti tra 5 ore + elif extract_number_it(word) and (word_next in time_multiples): + d_time = int(extract_number_it(word)) + used += 2 + if word_next == 'ora': + hr_offset = d_time + isTime = False + hr_abs = -1 + min_abs = -1 + elif word_next == 'minuto': + min_offset = d_time + isTime = False + hr_abs = -1 + min_abs = -1 + elif word_next == 'secondo': + sec_offset = d_time + isTime = False + hr_abs = -1 + min_abs = -1 + elif word == 'mezzora': + min_offset = 30 + used = 1 + isTime = False + hr_abs = -1 + min_abs = -1 + # if word_prev == 'uno' or word_prev == 'una': + # start -= 1 + # used += 1 + elif extract_number_it(word) and word_next and \ + word_next == 'quarto' and word_next_next == 'ora': + if int(extract_number_it(word)) == 1 \ + or int(extract_number_it(word)) == 3: + min_offset = 15 * int(extract_number_it(word)) + else: # elimina eventuali errori + min_offset = 15 + used = 3 + start -= 1 + isTime = False + hr_abs = -1 + min_abs = -1 + elif word[0].isdigit(): + isTime = True + str_hh = '' + str_mm = '' + remainder = '' + if ':' in word: + 
# parse colons + # '3:00 in the morning' + components = word.split(':') + if len(components) == 2: + num0 = int(extract_number_it(components[0])) + num1 = int(extract_number_it(components[1])) + if num0 is not False and num1 is not False \ + and 0 <= num0 <= 23 and 0 <= num1 <= 59: + str_hh = str(num0) + str_mm = str(num1) + elif 0 < int(extract_number_it(word)) < 24 \ + and word_next != 'quarto': + str_hh = str(int(word)) + str_mm = '00' + elif 100 <= int(word) <= 2400: + str_hh = int(word) / 100 + str_mm = int(word) - str_hh * 100 + military = True + isTime = False + if extract_number_it(word) and word_next \ + and word_next == 'quarto' and word_next_next != 'ora': + if int(extract_number_it(word)) == 1 \ + or int(extract_number_it(word)) == 3: + str_mm = str(15 * int(extract_number_it(word))) + else: # elimina eventuali errori + str_mm = '0' + str_hh = str(hr_abs) + used = 2 + words[idx + 1] = '' + isTime = False + if extract_number_it(word) and word_next \ + and word_next == 'in_punto': + str_hh = str(int(extract_number_it(word))) + used = 2 + if word_next == 'pm': + remainder = 'pm' + hr_abs = int(str_hh) + min_abs = int(str_mm) + if hr_abs <= 12: + hr_abs = hr_abs + 12 + used = 2 + elif word_next == 'am': + remainder = 'am' + hr_abs = int(str_hh) + min_abs = int(str_mm) + used = 2 + elif word_next == 'mattina': + # ' 11 del mattina' + hh = int(str_hh) + mm = int(str_mm) + used = 2 + remainder = 'am' + isTime = False + hr_abs = hh + min_abs = mm + elif word_next == 'pomeriggio': + # ' 2 del pomeriggio' + hh = int(str_hh) + mm = int(str_mm) + if hh < 12: + hh += 12 + used = 2 + remainder = 'pm' + isTime = False + hr_abs = hh + min_abs = mm + elif word_next == 'sera': + # 'alle 8 di sera' + hh = int(str_hh) + mm = int(str_mm) + if hh < 12: + hh += 12 + used = 2 + remainder = 'pm' + isTime = False + hr_abs = hh + min_abs = mm + elif word_next == 'notte': + hh = int(str_hh) + mm = int(str_mm) + if hh > 5: + remainder = 'pm' + else: + remainder = 'am' + used = 2 + 
isTime = False + hr_abs = hh + min_abs = mm + # parse half an hour : undici e mezza + elif word_next and word_next == 'mezza': + hr_abs = int(str_hh) + min_abs = 30 + used = 2 + isTime = False + elif word_next and word_next == 'in_punto': + hr_abs = int(str_hh) + min_abs = 0 + str_mm = '0' + used = 2 + isTime = False + else: + # 17:30 + remainder = '' + hr_abs = int(str_hh) + min_abs = int(str_mm) + used = 1 + isTime = False + if word_prev == 'ora': + words[idx - 1] = '' + + if time_qualifier != '': + # military = True + if str_hh and int(str_hh) <= 12 and \ + (time_qualifier in time_qualifiers_pm): + str_hh = str(int(str_hh) + 12) + else: + isTime = False + + str_hh = int(str_hh) if str_hh else 0 + str_mm = int(str_mm) if str_mm else 0 + + str_hh = str_hh + 12 if remainder == 'pm' \ + and str_hh < 12 else str_hh + str_hh = str_hh - 12 if remainder == 'am' \ + and str_hh >= 12 else str_hh + + if (not military and + remainder not in ['am', 'pm'] and + ((not day_specified) or day_offset < 1)): + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + hr_abs = str_hh + if anchorDate.hour < str_hh: + pass # No modification needed + elif anchorDate.hour < str_hh + 12: + str_hh += 12 + hr_abs = str_hh + else: + # has passed, assume the next morning + day_offset += 1 + + if time_qualifier in time_qualifiers_pm and str_hh < 12: + str_hh += 12 + + if str_hh > 24 or str_mm > 59: + isTime = False + used = 0 + if isTime: + hr_abs = str_hh * 1 + min_abs = str_mm * 1 + used += 1 + + if (hr_abs or 0) <= 12 and (time_qualifier == 'sera' or + time_qualifier == 'pomeriggio'): + hr_abs = (hr_abs or 0) + 12 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + words[idx + i] = '' + + if word_prev == 'o' or word_prev == 'oh': + words[words.index(word_prev)] = '' + + if idx > 0 and word_prev in markers: + words[idx - 1] = '' + if idx > 1 and word_prev_prev in markers: + words[idx - 2] = '' + 
+ idx += used - 1 + found = True + + # check that we found a date + if not date_found: + return None + + if day_offset is False: + day_offset = 0 + + # perform date manipulation + + extracted_date = anchorDate.replace(microsecond=0) + + if datestr != '': + en_months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + en_months_short = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', + 'aug', 'sept', 'oct', 'nov', 'dec'] + + for idx, en_month in enumerate(en_months): + datestr = datestr.replace(months[idx], en_month) + + for idx, en_month in enumerate(en_months_short): + datestr = datestr.replace(months_short[idx], en_month) + + try: + temp = datetime.strptime(datestr, '%B %d') + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(datestr, '%B %d %Y') + extracted_date = extracted_date.replace(hour=0, minute=0, second=0) + if not has_year: + temp = temp.replace(year=extracted_date.year, + tzinfo=extracted_date.tzinfo) + if extracted_date < temp: + extracted_date = extracted_date.replace( + year=int(current_year), + month=int(temp.strftime('%m')), + day=int(temp.strftime('%d')), + tzinfo=extracted_date.tzinfo) + else: + extracted_date = extracted_date.replace( + year=int(current_year) + 1, + month=int(temp.strftime('%m')), + day=int(temp.strftime('%d')), + tzinfo=extracted_date.tzinfo) + else: + extracted_date = extracted_date.replace( + year=int(temp.strftime('%Y')), + month=int(temp.strftime('%m')), + day=int(temp.strftime('%d')), + tzinfo=extracted_date.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hr_offset == 0 and min_offset == 0 and sec_offset == 0: + extracted_date = extracted_date.replace(hour=0, minute=0, second=0) + + if year_offset != 0: + extracted_date = extracted_date + relativedelta(years=year_offset) + if month_offset != 0: + extracted_date = extracted_date + relativedelta(months=month_offset) + if 
def extract_duration_nl(text):
    """Extract a duration from a Dutch phrase.

    The text is first passed through ``_convert_words_to_numbers_nl``
    (presumably turning number words into digits -- confirm against
    ovos_number_parser). Every "<number> <unit>" pair found afterwards,
    e.g. "10 minuten" or "3 dagen 8 uur", is added to the total duration
    and removed from the text.

    As an example, "zet een timer voor 5 minuten" is expected to yield a
    five minute timedelta and the remainder "zet een timer voor".

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str) or None:
            A tuple containing the duration and the remaining text
            not consumed in the parsing. The first value will
            be None if no duration is found. The text returned
            will have whitespace stripped from the ends.
            None (not a tuple) is returned when ``text`` is empty.
    """
    if not text:
        return None

    # Accumulators, keyed by the keyword names datetime.timedelta accepts.
    time_units = {
        'microseconds': 0,
        'milliseconds': 0,
        'seconds': 0,
        'minutes': 0,
        'hours': 0,
        'days': 0,
        'weeks': 0
    }

    nl_translations = {
        'microseconds': ["microsecond", "microseconde", "microseconden", "microsecondje", "microsecondjes"],
        'milliseconds': ["millisecond", "milliseconde", "milliseconden", "millisecondje", "millisecondjes"],
        'seconds': ["second", "seconde", "seconden", "secondje", "secondjes"],
        'minutes': ["minuut", "minuten", "minuutje", "minuutjes"],
        'hours': ["uur", "uren", "uurtje", "uurtjes"],
        'days': ["dag", "dagen", "dagje", "dagjes"],
        'weeks': ["week", "weken", "weekje", "weekjes"]
    }

    # The capture group must be a well-formed named group: a bare "(?P"
    # followed by "\d" is rejected by the re module with re.error.
    # With exactly one capturing group, findall() returns the captured
    # number strings directly.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)\s+{unit}"
    text = _convert_words_to_numbers_nl(text)

    for unit in time_units:
        # Longest spelling first, so that e.g. "seconden" is consumed
        # before its prefix "second" gets a chance to match.
        for unit_nl in sorted(nl_translations[unit], key=len, reverse=True):
            unit_re = re.compile(pattern.format(unit=unit_nl))
            matches = unit_re.findall(text)
            time_units[unit] += sum(map(float, matches))
            # Remove the consumed "<number> <unit>" fragments.
            text = unit_re.sub('', text)

    text = text.strip()
    duration = timedelta(**time_units) if any(time_units.values()) else None

    return (duration, text)
Also consumes the words used to define the date + returning the remaining string. For example, the string + "what is Tuesday's weather forecast" + returns the date for the forthcoming Tuesday relative to the reference + date and the remainder string + "what is weather forecast". + + The "next" instance of a day or weekend is considered to be no earlier than + 48 hours in the future. On Friday, "next Monday" would be in 3 days. + On Saturday, "next Monday" would be in 9 days. + + Args: + text (str): string containing date words + dateNow (datetime): A reference date/time for "tommorrow", etc + default_time (time): Time to set if no time was found in the string + + Returns: + [datetime, str]: An array containing the datetime and the remaining + text not consumed in the parsing, or None if no + date or time related text was found. + """ + + def clean_string(s): + # clean unneeded punctuation and capitalization among other things. + s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ + .replace(' de ', ' ').replace(' het ', ' ').replace(' het ', ' ') \ + .replace("paar", "2").replace("eeuwen", "eeuw") \ + .replace("decennia", "decennium") \ + .replace("millennia", "millennium") + + wordList = s.split() + for idx, word in enumerate(wordList): + ordinals = ["ste", "de"] + if word[0].isdigit(): + for ordinal in ordinals: + # "second" is the only case we should not do this + if ordinal in word and "second" not in word: + word = word.replace(ordinal, "") + wordList[idx] = word + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if text == "": + return None + + anchorDate = anchorDate or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + today = anchorDate.strftime("%w") + currentYear = anchorDate.strftime("%Y") + fromFlag = 
False + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersAM = ['ochtend'] + timeQualifiersPM = ['middag', 'avond', 'nacht'] + timeQualifiersList = timeQualifiersAM + timeQualifiersPM + timeQualifierOffsets = [8, 15, 19, 0] + markers = ['op', 'in', 'om', 'tegen', 'over', + 'deze', 'rond', 'voor', 'van', "binnen"] + days = ["maandag", "dinsdag", "woensdag", "donderdag", "vrijdag", + "zaterdag", "zondag"] + day_parts = [a + b for a in days for b in timeQualifiersList] + months = ['januari', 'februari', 'maart', 'april', 'mei', 'juni', + 'juli', 'augustus', 'september', 'oktober', 'november', + 'december'] + recur_markers = days + [d + 'en' for d in days] + ['weekeinde', 'werkdag', + 'weekeinden', 'werkdagen'] + months_short = ['jan', 'feb', 'mar', 'apr', 'mei', 'jun', 'jul', 'aug', + 'sep', 'okt', 'nov', 'dec'] + year_multiples = ["decennium", "eeuw", "millennium"] + day_multiples = ["dagen", "weken", "maanden", "jaren"] + + words = clean_string(text) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + + if word == "nu" and not datestr: + resultStr = " ".join(words[idx + 1:]) + resultStr = ' '.join(resultStr.split()) + extractedDate = anchorDate.replace(microsecond=0) + return [extractedDate, resultStr] + elif wordNext in year_multiples: + multiplier = None + if is_numeric(word): + multiplier = extract_number_nl(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if wordNext == "decennium": + yearOffset = multiplier * 10 + elif wordNext == "eeuw": + yearOffset = multiplier * 100 + elif wordNext == "millennium": + yearOffset = multiplier * 1000 + # paar + elif word == "2" and \ + wordNextNext in year_multiples: + multiplier = 
2 + used += 2 + if wordNextNext == "decennia": + yearOffset = multiplier * 10 + elif wordNextNext == "eeuwen": + yearOffset = multiplier * 100 + elif wordNextNext == "millennia": + yearOffset = multiplier * 1000 + elif word == "2" and \ + wordNextNext in day_multiples: + multiplier = 2 + used += 2 + if wordNextNext == "jaren": + yearOffset = multiplier + elif wordNextNext == "maanden": + monthOffset = multiplier + elif wordNextNext == "weken": + dayOffset = multiplier * 7 + elif word in timeQualifiersList: + timeQualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "vandaag" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "morgen" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "overmorgen" and not fromFlag: + dayOffset = 2 + used += 1 + # parse 5 days, 10 weeks, last week, next week + elif word == "dag" or word == "dagen": + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + elif word == "week" or word == "weken" and not fromFlag: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + elif wordPrev == "volgende": + dayOffset = 7 + start -= 1 + used = 2 + elif wordPrev == "vorige": + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "maand" and not fromFlag: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "volgende": + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "vorige": + monthOffset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "jaar" and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "volgend": + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "vorig": + yearOffset = -1 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. 
+ elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "volgende": + if dayOffset <= 2: + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "vorige": + dayOffset -= 7 + used += 1 + start -= 1 + elif word in day_parts and not fromFlag: + d = day_parts.index(word) / len(timeQualifiersList) + dayOffset = (d + 1) - int(today) + if dayOffset < 0: + dayOffset += 7 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in months_short and not fromFlag: + try: + m = months.index(word) + except ValueError: + m = months_short.index(word) + used += 1 + datestr = months[m] + if wordPrev and \ + (wordPrev[0].isdigit() or (wordPrev == "van" and + wordPrevPrev[0].isdigit())): + if wordPrev == "van" and wordPrevPrev[0].isdigit(): + datestr += " " + words[idx - 2] + used += 1 + start -= 1 + else: + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + months_short + validFollowups.append("vandaag") + validFollowups.append("morgen") + validFollowups.append("volgende") + validFollowups.append("vorige") + validFollowups.append("nu") + if (word == "van" or word == "na") and wordNext in validFollowups: + used = 2 + fromFlag = True + if wordNext == "morgen": + dayOffset += 1 + elif wordNext == "overmorgen": + dayOffset += 2 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + elif 
wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNext == "volgende": + if dayOffset <= 2: + tmpOffset += 7 + used += 1 + start -= 1 + elif wordNext == "vorige": + tmpOffset -= 7 + used += 1 + start -= 1 + dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and words[start - 1] == "deze": + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + # parse nacht ochtend, middag, avond + used = 0 + if word.startswith("gister"): + dayOffset = -1 + elif word.startswith("morgen"): + dayOffset = 1 + + if word.endswith("nacht"): + if hrAbs is None: + hrAbs = 0 + used += 1 + elif word.endswith("ochtend"): + if hrAbs is None: + hrAbs = 8 + used += 1 + elif word.endswith("middag"): + if hrAbs is None: + hrAbs = 15 + used += 1 + elif word.endswith("avond"): + if hrAbs is None: + hrAbs = 19 + used += 1 + + # "paar" time_unit + elif word == "2" and \ + wordNextNext in ["uur", "minuten", "seconden"]: + used += 2 + if wordNextNext == "uur": + hrOffset = 2 + elif wordNextNext == "minuten": + minOffset = 2 + elif wordNextNext == "seconden": + secOffset = 2 + # parse half an hour, quarter hour + elif word == "uur" and \ + (wordPrev in markers or wordPrevPrev in markers): + if wordPrev == "half": + minOffset = 30 + elif wordPrev == "kwartier": + minOffset = 15 + elif wordPrevPrev == "kwartier": + minOffset = 15 + if idx > 2 and words[idx - 3] in markers: + words[idx - 3] = 
"" + if words[idx - 3] == "deze": + daySpecified = True + words[idx - 2] = "" + elif wordPrev == "binnen": + hrOffset = 1 + else: + hrOffset = 1 + if wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "deze": + daySpecified = True + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + # parse "over een minuut" + elif word == "minuut" and wordPrev == "over": + minOffset = 1 + words[idx - 1] = "" + used += 1 + # parse "over een seconde" + elif word == "seconde" and wordPrev == "over": + secOffset = 1 + words[idx - 1] = "" + used += 1 + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + if wordNext == "vannacht" or wordNextNext == "vannacht" or \ + wordPrev == "vannacht" or wordPrevPrev == "vannacht" or \ + wordNextNextNext == "vannacht": + remainder = "pm" + used += 1 + if wordPrev == "vannacht": + words[idx - 1] = "" + if wordPrevPrev == "vannacht": + words[idx - 2] = "" + if wordNextNext == "vannacht": + used += 1 + if wordNextNextNext == "vannacht": + used += 1 + + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + + elif wordNext == "in" and wordNextNext == "ochtend": + remainder = "am" + used += 2 + elif wordNext == "in" and wordNextNext == "middag": + remainder = "pm" + used += 2 + elif wordNext == "in" and wordNextNext == "avond": + remainder = "pm" + used += 2 + elif wordNext == "'s" and wordNextNext == "ochtends": 
+ remainder = "am" + used += 2 + elif wordNext == "'s" and wordNextNext == "middags": + remainder = "pm" + used += 2 + elif wordNext == "'s" and wordNextNext == "avonds": + remainder = "pm" + used += 2 + elif wordNext == "deze" and wordNextNext == "ochtend": + remainder = "am" + used = 2 + daySpecified = True + elif wordNext == "deze" and wordNextNext == "middag": + remainder = "pm" + used = 2 + daySpecified = True + elif wordNext == "deze" and wordNextNext == "avond": + remainder = "pm" + used = 2 + daySpecified = True + elif wordNext == "'s" and wordNextNext == "nachts": + if strHH and int(strHH) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + + else: + if timeQualifier != "": + military = True + if strHH and int(strHH) <= 12 and \ + (timeQualifier in timeQualifiersPM): + strHH += str(int(strHH) + 12) + + else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." 
or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + wordNext in recur_markers or + wordNextNext in recur_markers): + # Ex: "7 on mondays" or "3 this friday" + # Set strHH so that isTime == True + # when am or pm is not specified + strHH = strNum + used = 1 + else: + if ( + (wordNext == "uren" or wordNext == "uur" or + remainder == "uren" or remainder == "uur") and + word[0] != '0' and + ( + int(strNum) < 100 or + int(strNum) > 2400 + )): + # ignores military time + # "in 3 hours" + hrOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "minuten" or wordNext == "minuut" or \ + remainder == "minuten" or remainder == "minuut": + # "in 10 minutes" + minOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "seconden" or wordNext == "seconde" \ + or remainder == "seconden" or \ + remainder == "seconde": + # in 5 seconds + secOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(strNum) > 100: + # military time, eg. "3300 hours" + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if wordNext == "uur" or remainder == "uur": + used += 1 + elif wordNext and wordNext[0].isdigit(): + # military time, e.g. 
"04 38 hours" + strHH = strNum + strMM = wordNext + military = True + used += 1 + if (wordNextNext == "uur" or remainder == "uur"): + used += 1 + elif ( + wordNext == "" or wordNext == "uur" or + ( + wordNext == "in" and + ( + wordNextNext == "de" or + wordNextNext == timeQualifier + ) + ) or wordNext == 'vannacht' or + wordNextNext == 'vannacht'): + + strHH = strNum + strMM = "00" + if wordNext == "uur": + used += 1 + + if wordNext == "in" or wordNextNext == "in": + used += (1 if wordNext == "in" else 2) + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if (wordNextNext and + (wordNextNext in timeQualifier or + wordNextNextNext in timeQualifier)): + if (wordNextNext in timeQualifiersPM or + wordNextNextNext in timeQualifiersPM): + remainder = "pm" + used += 1 + if (wordNextNext in timeQualifiersAM or + wordNextNextNext in timeQualifiersAM): + remainder = "am" + used += 1 + + if timeQualifier != "": + if timeQualifier in timeQualifiersPM: + remainder = "pm" + used += 1 + + elif timeQualifier in timeQualifiersAM: + remainder = "am" + used += 1 + else: + # TODO: Unsure if this is 100% accurate + used += 1 + military = True + else: + isTime = False + HH = int(strHH) if strHH else 0 + MM = int(strMM) if strMM else 0 + HH = HH + 12 if remainder == "pm" and HH < 12 else HH + HH = HH - 12 if remainder == "am" and HH >= 12 else HH + + if (not military and + remainder not in ['am', 'pm', 'uren', 'minuten', + "seconde", "seconden", + "uur", "minuut"] and + ((not daySpecified) or dayOffset < 1)): + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + if anchorDate.hour < HH or (anchorDate.hour == HH and + anchorDate.minute < MM): + pass # No modification needed + elif anchorDate.hour < HH + 12: + HH += 12 + else: + # has passed, assume the next morning + dayOffset += 1 + + if timeQualifier in timeQualifiersPM and HH < 12: + HH += 12 + + if HH > 24 or MM > 59: + isTime = False + used 
= 0 + if isTime: + hrAbs = HH + minAbs = MM + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + + if wordPrev == "vroeg": + hrOffset = -1 + words[idx - 1] = "" + idx -= 1 + elif wordPrev == "laat": + hrOffset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and wordPrev in markers: + words[idx - 1] = "" + if wordPrev == "deze": + daySpecified = True + if idx > 1 and wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "deze": + daySpecified = True + + idx += used - 1 + found = True + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = anchorDate.replace(microsecond=0) + + if datestr != "": + # date included an explicit date, e.g. "june 5" or "june 2, 2017" + try: + temp = datetime.strptime(datestr, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(datestr, "%B %d %Y") + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + if not hasYear: + temp = temp.replace(year=extractedDate.year, + tzinfo=extractedDate.tzinfo) + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hrOffset == 0 and minOffset == 0 and secOffset == 0: + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + + if yearOffset != 0: + extractedDate = 
extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + if hrAbs != -1 and minAbs != -1: + # If no time was supplied in the string set the time to default + # time if it's available + if hrAbs is None and minAbs is None and default_time is not None: + hrAbs, minAbs = default_time.hour, default_time.minute + else: + hrAbs = hrAbs or 0 + minAbs = minAbs or 0 + + extractedDate = extractedDate.replace(hour=hrAbs, + minute=minAbs) + if (hrAbs != 0 or minAbs != 0) and datestr == "": + if not daySpecified and anchorDate > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + for idx, word in enumerate(words): + if words[idx] == "en" and \ + words[idx - 1] == "" and words[idx + 1] == "": + words[idx] = "" + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + return [extractedDate, resultStr] + + +def nice_time_nl(dt, speech=True, use_24hour=False, use_ampm=False): + """ + Format a time to a comfortable human format + + For example, generate 'five thirty' for speech or '5:30' for + text display. + + Args: + dt (datetime): date to format (assumes already in local timezone) + speech (bool): format for speech (default/True) or display (False)=Fal + use_24hour (bool): output in 24-hour/military or 12-hour format + use_ampm (bool): include the am/pm for 12-hour format + Returns: + (str): The formatted time string + """ + if use_24hour: + # e.g. "03:01" or "14:22" + string = dt.strftime("%H:%M") + else: + if use_ampm: + # e.g. "3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. 
"3:01" or "2:22" + string = dt.strftime("%I:%M") + if string[0] == '0': + string = string[1:] # strip leading zeros + + if not speech: + return string + + # Generate a speakable version of the time + speak = "" + if use_24hour: + speak += pronounce_number_nl(dt.hour) + speak += " uur" + if not dt.minute == 0: # zero minutes are not pronounced, 13:00 is + # "13 uur" not "13 hundred hours" + speak += " " + pronounce_number_nl(dt.minute) + return speak # ampm is ignored when use_24hour is true + else: + if dt.hour == 0 and dt.minute == 0: + return "Middernacht" + hour = dt.hour % 12 + if dt.minute == 0: + hour = _fix_hour_nl(hour) + speak += pronounce_number_nl(hour) + speak += " uur" + elif dt.minute == 30: + speak += "half " + hour += 1 + hour = _fix_hour_nl(hour) + speak += pronounce_number_nl(hour) + elif dt.minute == 15: + speak += "kwart over " + hour = _fix_hour_nl(hour) + speak += pronounce_number_nl(hour) + elif dt.minute == 45: + speak += "kwart voor " + hour += 1 + hour = _fix_hour_nl(hour) + speak += pronounce_number_nl(hour) + elif dt.minute > 30: + speak += pronounce_number_nl(60 - dt.minute) + speak += " voor " + hour += 1 + hour = _fix_hour_nl(hour) + speak += pronounce_number_nl(hour) + else: + speak += pronounce_number_nl(dt.minute) + speak += " over " + hour = _fix_hour_nl(hour) + speak += pronounce_number_nl(hour) + + if use_ampm: + speak += nice_part_of_day_nl(dt) + + return speak + + +def _fix_hour_nl(hour): + hour = hour % 12 + if hour == 0: + hour = 12 + return hour + + +def nice_part_of_day_nl(dt, speech=True): + if dt.hour < 6: + return " 's nachts" + if dt.hour < 12: + return " 's ochtends" + if dt.hour < 18: + return " 's middags" + if dt.hour < 24: + return " 's avonds" + raise ValueError('dt.hour is bigger than 24') diff --git a/ovos_date_parser/dates_pl.py b/ovos_date_parser/dates_pl.py new file mode 100644 index 0000000..1044512 --- /dev/null +++ b/ovos_date_parser/dates_pl.py @@ -0,0 +1,1075 @@ +import re +from datetime import 
datetime, timedelta + +from dateutil.relativedelta import relativedelta +from ovos_number_parser.numbers_pl import pronounce_number_pl, extract_number_pl, _convert_words_to_numbers_pl +from ovos_number_parser.util import is_numeric +from ovos_utils.time import now_local + +_TIME_UNITS_CONVERSION = { + 'mikrosekund': 'microseconds', + 'mikrosekundy': 'microseconds', + 'milisekund': 'milliseconds', + 'milisekundy': 'milliseconds', + 'sekunda': 'seconds', + 'sekundy': 'seconds', + 'sekund': 'seconds', + 'minuta': 'minutes', + 'minuty': 'minutes', + 'minut': 'minutes', + 'godzina': 'hours', + 'godziny': 'hours', + 'godzin': 'hours', + 'dzień': 'days', + 'dni': 'days', + 'tydzień': 'weeks', + 'tygodni': 'weeks', + 'tygodnie': 'weeks', + 'tygodniu': 'weeks', +} + +_TIME_UNITS_NORMALIZATION = { + 'mikrosekunda': 'mikrosekunda', + 'mikrosekundę': 'mikrosekunda', + 'mikrosekund': 'mikrosekunda', + 'mikrosekundy': 'mikrosekunda', + 'milisekunda': 'milisekunda', + 'milisekundę': 'milisekunda', + 'milisekund': 'milisekunda', + 'milisekundy': 'milisekunda', + 'sekunda': 'sekunda', + 'sekundę': 'sekunda', + 'sekundy': 'sekunda', + 'sekund': 'sekunda', + 'minuta': 'minuta', + 'minutę': 'minuta', + 'minut': 'minuta', + 'minuty': 'minuta', + 'godzina': 'godzina', + 'godzinę': 'godzina', + 'godzin': 'godzina', + 'godziny': 'godzina', + 'dzień': 'dzień', + 'dni': 'dzień', + 'tydzień': 'tydzień', + 'tygodni': 'tydzień', + 'tygodnie': 'tydzień', + 'tygodniu': 'tydzień', + 'miesiąc': 'miesiąc', + 'miesiące': 'miesiąc', + 'miesięcy': 'miesiąc', + 'rok': 'rok', + 'lata': 'rok', + 'lat': 'rok', + 'dekada': 'dekada', + 'dekad': 'dekada', + 'dekady': 'dekada', + 'dekadę': 'dekada', + 'wiek': 'wiek', + 'wieki': 'wiek', + 'milenia': 'milenia', + 'milenium': 'milenia', +} + +_MONTHS_TO_EN = { + 'styczeń': 'January', + 'stycznia': 'January', + 'luty': 'February', + 'lutego': 'February', + 'marzec': 'March', + 'marca': 'March', + 'kwiecień': 'April', + 'kwietnia': 'April', + 'maj': 'May', + 
'maja': 'May', + 'czerwiec': 'June', + 'czerwca': 'June', + 'lipiec': 'July', + 'lipca': 'July', + 'sierpień': 'August', + 'sierpnia': 'August', + 'wrzesień': 'September', + 'września': 'September', + 'październik': 'October', + 'października': 'October', + 'listopad': 'November', + 'listopada': 'November', + 'grudzień': 'December', + 'grudnia': 'December', +} + +_DAYS_TO_EN = { + 'poniedziałek': 0, + 'poniedziałkach': 0, + 'poniedziałkami': 0, + 'poniedziałki': 0, + 'poniedziałkiem': 0, + 'poniedziałkom': 0, + 'poniedziałkowa': 0, + 'poniedziałkową': 0, + 'poniedziałkowe': 0, + 'poniedziałkowego': 0, + 'poniedziałkowej': 0, + 'poniedziałkowemu': 0, + 'poniedziałkowi': 0, + 'poniedziałkowy': 0, + 'poniedziałkowych': 0, + 'poniedziałkowym': 0, + 'poniedziałkowymi': 0, + 'poniedziałków': 0, + 'poniedziałku': 0, + 'wtorek': 1, + 'wtorkach': 1, + 'wtorkami': 1, + 'wtorki': 1, + 'wtorkiem': 1, + 'wtorkom': 1, + 'wtorkowa': 1, + 'wtorkową': 1, + 'wtorkowe': 1, + 'wtorkowego': 1, + 'wtorkowej': 1, + 'wtorkowemu': 1, + 'wtorkowi': 1, + 'wtorkowy': 1, + 'wtorkowych': 1, + 'wtorkowym': 1, + 'wtorkowymi': 1, + 'wtorków': 1, + 'wtorku': 1, + 'środa': 2, + 'środach': 2, + 'środami': 2, + 'środą': 2, + 'środę': 2, + 'środo': 2, + 'środom': 2, + 'środowa': 2, + 'środową': 2, + 'środowe': 2, + 'środowego': 2, + 'środowej': 2, + 'środowemu': 2, + 'środowi': 2, + 'środowy': 2, + 'środowych': 2, + 'środowym': 2, + 'środowymi': 2, + 'środy': 2, + 'środzie': 2, + 'śród': 2, + 'czwartek': 3, + 'czwartkach': 3, + 'czwartkami': 3, + 'czwartki': 3, + 'czwartkiem': 3, + 'czwartkom': 3, + 'czwartkowa': 3, + 'czwartkową': 3, + 'czwartkowe': 3, + 'czwartkowego': 3, + 'czwartkowej': 3, + 'czwartkowemu': 3, + 'czwartkowi': 3, + 'czwartkowy': 3, + 'czwartkowych': 3, + 'czwartkowym': 3, + 'czwartkowymi': 3, + 'czwartków': 3, + 'czwartku': 3, + 'piątek': 4, + 'piątkach': 4, + 'piątkami': 4, + 'piątki': 4, + 'piątkiem': 4, + 'piątkom': 4, + 'piątkowa': 4, + 'piątkową': 4, + 'piątkowe': 4, + 
def nice_time_pl(dt, speech=True, use_24hour=True, use_ampm=False):
    """
    Format a time to a comfortable human format

    Display output is always 'HH:MM'; spoken output is the hour as a
    feminine ordinal followed by the minutes.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): accepted for API parity; this implementation
                           always uses the 24-hour clock
        use_ampm (bool): accepted for API parity; not used
    Returns:
        (str): The formatted time string
    """
    if not speech:
        return dt.strftime("%H:%M")

    parts = []
    if dt.hour != 0:
        # Swap the last letter of the ordinal for the feminine ending 'a'.
        # NOTE(review): assumes every ordinal returned by
        # pronounce_number_pl takes a plain '-a' ending - confirm.
        ordinal = pronounce_number_pl(dt.hour, ordinals=True)
        parts.append(ordinal[:-1] + 'a')

    if dt.minute == 0:
        parts.append('zero zero')
    else:
        parts.append(pronounce_number_pl(dt.minute))

    speak = ' '.join(parts)
    if dt.hour == 0:
        # hour zero is not spoken; the time is given as "... after midnight"
        speak += " po północy"
    return speak


def _plural_form_pl(count, singular, paucal, plural):
    """Pick the Polish noun form that agrees with the integer count."""
    if count == 1:
        return singular
    # 2-4, 22-24, 32-34 ... take the paucal form, except the teens 12-14
    if 2 <= count % 10 <= 4 and not 12 <= count % 100 <= 14:
        return paucal
    return plural


def nice_duration_pl(duration, speech=True):
    """ Convert duration to a nice spoken timespan

    Args:
        duration (int, float or timedelta): length of the timespan;
            numbers are taken as seconds
        speech (bool): must be True, a display form is not implemented
    Returns:
        str: timespan as a string, '' when every component is zero
    Raises:
        NotImplementedError: if speech is False
    """
    if not speech:
        raise NotImplementedError

    if isinstance(duration, timedelta):
        duration = duration.total_seconds()

    days = int(duration // 86400)
    hours = int(duration // 3600 % 24)
    minutes = int(duration // 60 % 60)
    seconds = int(duration % 60)

    parts = []
    if days > 0:
        parts.append(pronounce_number_pl(days) + " " +
                     ('dzień' if days == 1 else 'dni'))

    feminine_units = (
        (hours, ('godzina', 'godziny', 'godzin')),
        (minutes, ('minuta', 'minuty', 'minut')),
        (seconds, ('sekunda', 'sekundy', 'sekund')),
    )
    for count, forms in feminine_units:
        if count > 0:
            parts.append(get_pronounce_number_for_duration(count) + " " +
                         _plural_form_pl(count, *forms))

    return " ".join(parts)


def get_pronounce_number_for_duration(num):
    """
    Pronounce num for use in front of a feminine time unit.

    Only an exact 'jeden' is replaced by 'jedna'; every other
    pronunciation is returned as given by pronounce_number_pl.
    """
    pronounced = pronounce_number_pl(num)

    return 'jedna' if pronounced == 'jeden' else pronounced


def extract_duration_pl(text):
    """
    Convert a Polish phrase into a duration

    Convert things like:
        "10 minut"
        "3 dni 8 godzin 10 minut"
    into a timedelta.

    The words used in the duration will be consumed, and
    the remainder returned.

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The first value will
                    be None if no duration is found. The text returned
                    will have whitespace stripped from the ends.
                    For empty input a bare None is returned instead of
                    a tuple.
    """
    if not text:
        return None

    time_units = dict.fromkeys(
        ('microseconds', 'milliseconds', 'seconds', 'minutes',
         'hours', 'days', 'weeks'), 0)

    # The group must be spelled (?P<value>...); a bare "(?P" is rejected
    # by the re module.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}[ayeę]?"
    text = _convert_words_to_numbers_pl(text)

    for unit, unit_en in _TIME_UNITS_CONVERSION.items():
        unit_pattern = pattern.format(unit=unit)
        matches = re.findall(unit_pattern, text)
        if matches:
            # Several inflected forms map to one unit; add them up so no
            # consumed value is lost. Matched text is removed right away,
            # so a later form cannot count it twice.
            time_units[unit_en] += sum(map(float, matches))
            text = re.sub(unit_pattern, '', text)

    text = text.strip()
    duration = timedelta(**time_units) if any(time_units.values()) else None

    return (duration, text)
+ s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ + .replace("para", "2") + + wordList = s.split() + for idx, word in enumerate(wordList): + ordinals = ["ci", "szy", "gi"] + if word[0].isdigit(): + for ordinal in ordinals: + if ordinal in word: + word = word.replace(ordinal, "") + wordList[idx] = word + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if string == "": + return None + + dateNow = dateNow or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersAM = ['rano'] + timeQualifiersPM = ['wieczór', 'w nocy'] + timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) + markers = ['na', 'w', 'we', 'na', 'przez', 'ten', 'około', 'dla', 'o', "pomiędzy", 'za', 'do'] + days = list(_DAYS_TO_EN.keys()) + recur_markers = days + ['weekend', 'weekendy'] + monthsShort = ['sty', 'lut', 'mar', 'kwi', 'maj', 'cze', 'lip', 'sie', + 'wrz', 'paź', 'lis', 'gru'] + year_multiples = ['dekada', 'wiek', 'milenia'] + + words = clean_string(string) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + start = idx + used = 0 + # save timequalifier for later + if word == 'w' and wordNext == 'tę': + used += 2 + if word == "temu" and dayOffset: + dayOffset = - dayOffset + used += 1 + if word == "teraz" and not datestr: + resultStr = " ".join(words[idx + 1:]) + resultStr = ' '.join(resultStr.split()) + extractedDate = 
dateNow.replace(microsecond=0) + return [extractedDate, resultStr] + elif wordNext in year_multiples: + multiplier = None + if is_numeric(word): + multiplier = extract_number_pl(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if _TIME_UNITS_NORMALIZATION.get(wordNext) == "dekada": + yearOffset = multiplier * 10 + elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "wiek": + yearOffset = multiplier * 100 + elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "milenia": + yearOffset = multiplier * 1000 + elif word in timeQualifiersList: + timeQualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "dzisiaj" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "jutro" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "przedwczoraj" and not fromFlag: + dayOffset = -2 + used += 1 + elif word == "wczoraj" and not fromFlag: + dayOffset = -1 + used += 1 + elif word == "pojutrze" and not fromFlag: + dayOffset = 2 + used = 1 + elif word == "dzień" and wordNext != 'robocze': + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + elif word == "tydzień" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + elif wordPrev == "następny": + dayOffset = 7 + start -= 1 + used = 2 + elif wordPrev == "poprzedni" or wordPrev == 'ostatni': + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "miesiąc" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "następny": + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "poprzedni" or wordPrev == 'ostatni': + monthOffset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "rok" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == 
"następny": + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "poprzedni" or wordPrev == 'ostatni': + yearOffset = -1 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days and not fromFlag: + d = _DAYS_TO_EN.get(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "następny": + if dayOffset <= 2: + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "poprzedni" or wordPrev == 'ostatni': + dayOffset -= 7 + used += 1 + start -= 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in _MONTHS_TO_EN or word in monthsShort and not fromFlag: + used += 1 + datestr = _MONTHS_TO_EN[word] + if wordPrev and wordPrev[0].isdigit(): + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + list(_MONTHS_TO_EN.keys()) + monthsShort + validFollowups.append("dzisiaj") + validFollowups.append("jutro") + validFollowups.append("wczoraj") + validFollowups.append("następny") + validFollowups.append("poprzedni") + validFollowups.append('ostatni') + validFollowups.append("teraz") + validFollowups.append("tego") + if (word == "od" or word == "po") and wordNext in validFollowups: + used = 2 + fromFlag = True + if wordNext == "jutro": + dayOffset += 1 + elif wordNext == "wczoraj": + dayOffset -= 1 + elif wordNext in days: + d = _DAYS_TO_EN.get(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in 
days: + d = _DAYS_TO_EN.get(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNext == "następny": + if dayOffset <= 2: + tmpOffset += 7 + used += 1 + start -= 1 + elif wordNext == "poprzedni" or wordNext == 'ostatni': + tmpOffset -= 7 + used += 1 + start -= 1 + dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and words[start - 1] == "ten": # this + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "południe": + hrAbs = 12 + used += 1 + elif word == "północ" or word == 'północy': + hrAbs = 0 + used += 1 + elif word == "rano": + if hrAbs is None: + hrAbs = 8 + used += 1 + elif word == "po" and wordNext == "południu": + if hrAbs is None: + hrAbs = 15 + used += 2 + elif word == "wieczór" or word == 'wieczorem': + if hrAbs is None: + hrAbs = 19 + used += 1 + elif word == "nocy": + if hrAbs is None: + hrAbs = 22 + used += 1 + # parse half an hour, quarter hour + elif word == "godzina" and (wordPrev.isdigit() or wordPrev in markers or wordPrevPrev in markers): + if wordPrev == "pół": + minOffset = 30 + else: + hrOffset = 1 + if wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "dzisiaj": + daySpecified = True + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + # parse in a minute + elif word == "minuta" and (wordPrev.isdigit() or wordPrev in markers): + minOffset = 1 + 
words[idx - 1] = "" + used += 1 + # parse in a second + elif word == "sekunda" and (wordPrev.isdigit() or wordPrev in markers): + secOffset = 1 + words[idx - 1] = "" + used += 1 + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + if wordNext == "wieczorem" or wordPrev == "wieczorem" or \ + wordNext == 'wieczór' or wordPrev == 'wieczór' or \ + (wordNext == 'po' and wordNextNext == 'południu'): + remainder = "pm" + used += 2 if wordNext == 'po' else 1 + if wordPrev == "wieczorem" or wordPrev == 'wieczór': + words[idx - 1] = "" + + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + if wordNext == "rano": + remainder = "am" + used += 1 + elif wordNext == "po" and wordNextNext == "południu": + remainder = "pm" + used += 2 + elif wordNext == "wieczorem": + remainder = "pm" + used += 1 + elif wordNext == "rano": + remainder = "am" + used += 1 + elif wordNext == "w" and wordNextNext == "nocy": + if strHH and int(strHH) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + + else: + if timeQualifier != "": + military = True + if strHH and int(strHH) <= 12 and \ + (timeQualifier in timeQualifiersPM): + strHH += str(int(strHH) + 12) + + else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. 
+ length = len(word) + strNum = "" + remainder = "" + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + (word[0].isdigit() and (wordNext == 'wieczorem' or wordNext == 'wieczór')) or + (word[0].isdigit() and wordNext == 'po' and wordNextNext == 'południu') or + (word[0].isdigit() and wordNext == 'w' and wordNextNext == 'nocy')): + strHH = strNum + remainder = "pm" + used = 2 if wordNext in ['po', 'w'] else 1 + elif ( + remainder == "am" or + (word[0].isdigit() and wordNext == 'rano')): + strHH = strNum + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + wordNext in recur_markers or + wordNextNext in recur_markers or ( + wordNext == 'w' and wordNextNext == 'dzień' and + wordNextNextNext == 'robocze' + )): + # Ex: "7 on mondays" or "3 this friday" + # Set strHH so that isTime == True + # when am or pm is not specified + strHH = strNum + used = 1 + else: + if _TIME_UNITS_NORMALIZATION.get(wordNext) == "godzina" or \ + _TIME_UNITS_NORMALIZATION.get(remainder) == "godzina": + # "in 10 hours" + hrOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "minuta" or \ + _TIME_UNITS_NORMALIZATION.get(remainder) == "minuta": + # "in 10 minutes" + minOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "sekunda" \ + or _TIME_UNITS_NORMALIZATION.get(remainder) == "sekunda": + # in 5 seconds + secOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(strNum) > 100: + # military time, eg. 
"3300 hours" + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if _TIME_UNITS_NORMALIZATION.get(wordNext) == "godzina" or \ + _TIME_UNITS_NORMALIZATION.get(remainder) == "godzina": + used += 1 + elif wordNext and wordNext[0].isdigit(): + # military time, e.g. "04 38 hours" + strHH = strNum + strMM = wordNext + military = True + used += 1 + elif ( + wordNext == "" or wordNext == "w" or wordNext == 'nocy' or + wordNextNext == 'nocy'): + strHH = strNum + strMM = "00" + + if wordNext == "za" or wordNextNext == "za": + used += (1 if wordNext == "za" else 2) + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if (wordNextNext and + (wordNextNext in timeQualifier or + wordNextNextNext in timeQualifier)): + if (wordNextNext in timeQualifiersPM or + wordNextNextNext in timeQualifiersPM): + remainder = "pm" + used += 1 + if (wordNextNext in timeQualifiersAM or + wordNextNextNext in timeQualifiersAM): + remainder = "am" + used += 1 + + if timeQualifier != "": + if timeQualifier in timeQualifiersPM: + remainder = "pm" + used += 1 + + elif timeQualifier in timeQualifiersAM: + remainder = "am" + used += 1 + else: + # TODO: Unsure if this is 100% accurate + used += 1 + military = True + else: + isTime = False + HH = int(strHH) if strHH else 0 + MM = int(strMM) if strMM else 0 + HH = HH + 12 if remainder == "pm" and HH < 12 else HH + HH = HH - 12 if remainder == "am" and HH >= 12 else HH + + if (not military and + remainder not in ['am', 'pm'] and + remainder not in _TIME_UNITS_NORMALIZATION and + ((not daySpecified) or 0 <= dayOffset < 1)): + + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + if dateNow.hour < HH or (dateNow.hour == HH and + dateNow.minute < MM): + pass # No modification needed + elif dateNow.hour < HH + 12: + HH += 12 + else: + # has passed, assume the next morning + dayOffset += 1 + + if timeQualifier in timeQualifiersPM and HH < 
12: + HH += 12 + + if HH > 24 or MM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = HH + minAbs = MM + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + + if wordPrev == "rano": + hrOffset = -1 + words[idx - 1] = "" + idx -= 1 + elif wordPrev == "wieczorem": + hrOffset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and wordPrev in markers: + words[idx - 1] = "" + if wordPrev == "najbliższą": + daySpecified = True + if idx > 1 and wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "najbliższą": + daySpecified = True + + idx += used - 1 + found = True + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow.replace(microsecond=0) + + if datestr != "": + # date included an explicit date, e.g. "june 5" or "june 2, 2017" + try: + temp = datetime.strptime(datestr, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(datestr, "%B %d %Y") + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + if not hasYear: + temp = temp.replace(year=extractedDate.year, + tzinfo=extractedDate.tzinfo) + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hrOffset == 0 and minOffset == 0 and secOffset == 0: + extractedDate = 
def nice_time_pt(dt, speech=True, use_24hour=False, use_ampm=False):
    """
    Format a time to a comfortable human format

    For example, generate 'cinco e meia' for speech or '5:30' for
    text display.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        # e.g. "3:01 AM" / "2:22 PM" with the marker, "3:01" without it
        string = dt.strftime("%I:%M %p" if use_ampm else "%I:%M")
        if string[0] == '0':
            string = string[1:]  # strip leading zeros

    if not speech:
        return string

    # Generate a speakable version of the time
    if use_24hour:
        # simply speak the numbers; one o'clock uses the feminine form
        speak = "uma" if dt.hour == 1 else pronounce_number_pt(dt.hour)
        if dt.minute > 0:
            speak += " e " + pronounce_number_pt(dt.minute)
        return speak

    if dt.minute in (35, 40, 45, 50, 55):
        # spoken as "<next hour> minus N"
        minute = dt.minute - 60
        # wrap so that 23:45 refers to midnight (0), never to hour 24
        hour = (dt.hour + 1) % 24
    else:
        minute = dt.minute
        hour = dt.hour

    if hour == 0:
        speak = "meia noite"
    elif hour == 12:
        speak = "meio dia"
    # 1 and 2 are pronounced in female form when talking about hours
    elif hour in (1, 13):
        speak = "uma"
    elif hour in (2, 14):
        speak = "duas"
    elif hour < 13:
        speak = pronounce_number_pt(hour)
    else:
        speak = pronounce_number_pt(hour - 12)

    if minute == 15:
        speak += " e um quarto"
    elif minute == 30:
        speak += " e meia"
    elif minute == -15:
        speak += " menos um quarto"
    elif minute > 0:
        speak += " e " + pronounce_number_pt(minute)
    elif minute < 0:
        speak += " " + pronounce_number_pt(minute)
    elif not use_ampm:
        # exact hour, e.g. 3:00
        speak += " em ponto"

    if use_ampm:
        # midnight (0) and noon (12) get no part-of-day suffix
        if 0 < hour < 6:
            speak += " da madrugada"
        elif 6 <= hour < 12:
            speak += " da manhã"
        elif 13 <= hour < 21:
            speak += " da tarde"
        elif hour != 0 and hour != 12:
            speak += " da noite"
    return speak
minAbs or secOffset != 0 + ) + + if text == "": + return None + + anchorDate = anchorDate or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + dateNow = anchorDate + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + words = clean_string(text).split(" ") + timeQualifiersList = ['manha', 'tarde', 'noite'] + time_indicators = ["em", "as", "nas", "pelas", "volta", "depois", "estas", + "no", "dia", "hora"] + days = ['segunda', 'terca', 'quarta', + 'quinta', 'sexta', 'sabado', 'domingo'] + months = ['janeiro', 'febreiro', 'marco', 'abril', 'maio', 'junho', + 'julho', 'agosto', 'setembro', 'outubro', 'novembro', + 'dezembro'] + monthsShort = ['jan', 'feb', 'mar', 'abr', 'mai', 'jun', 'jul', 'ag', + 'set', 'out', 'nov', 'dec'] + nexts = ["proximo", "proxima"] + suffix_nexts = ["seguinte", "subsequente", "seguir"] + lasts = ["ultimo", "ultima"] + suffix_lasts = ["passada", "passado", "anterior", "antes"] + nxts = ["depois", "seguir", "seguida", "seguinte", "proxima", "proximo"] + prevs = ["antes", "ante", "previa", "previamente", "anterior"] + froms = ["partir", "em", "para", "na", "no", "daqui", "seguir", + "depois", "por", "proxima", "proximo", "da", "do", "de"] + thises = ["este", "esta", "deste", "desta", "neste", "nesta", "nesse", + "nessa"] + froms += thises + lists = nxts + prevs + froms + time_indicators + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + if word in timeQualifiersList: + timeQualifier = word + + # parse today, tomorrow, yesterday + elif 
word == "hoje" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "amanha" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "ontem" and not fromFlag: + dayOffset -= 1 + used += 1 + # "before yesterday" and "before before yesterday" + elif (word == "anteontem" or + (word == "ante" and wordNext == "ontem")) and not fromFlag: + dayOffset -= 2 + used += 1 + if wordNext == "ontem": + used += 1 + elif word == "ante" and wordNext == "ante" and wordNextNext == \ + "ontem" and not fromFlag: + dayOffset -= 3 + used += 3 + elif word == "anteanteontem" and not fromFlag: + dayOffset -= 3 + used += 1 + # day after tomorrow + elif word == "depois" and wordNext == "amanha" and not fromFlag: + dayOffset += 2 + used = 2 + # day before yesterday + elif word == "antes" and wordNext == "ontem" and not fromFlag: + dayOffset -= 2 + used = 2 + # parse 5 days, 10 weeks, last week, next week, week after + elif word == "dia": + if wordNext == "depois" or wordNext == "antes": + used += 1 + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used += 1 + elif (wordPrev and wordPrev[0].isdigit() and + wordNext not in months and + wordNext not in monthsShort): + dayOffset += int(wordPrev) + start -= 1 + used += 2 + elif wordNext and wordNext[0].isdigit() and wordNextNext not in \ + months and wordNextNext not in monthsShort: + dayOffset += int(wordNext) + start -= 1 + used += 2 + + elif word == "semana" and not fromFlag: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + dayOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + dayOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "mes" and not fromFlag: + if wordPrev[0].isdigit(): + 
monthOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + monthOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + monthOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + monthOffset = -7 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "ano" and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + for w in nexts: + if wordPrev == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in lasts: + if wordPrev == w: + yearOffset = -7 + start -= 1 + used = 2 + for w in suffix_nexts: + if wordNext == w: + yearOffset = 7 + start -= 1 + used = 2 + for w in suffix_lasts: + if wordNext == w: + yearOffset = -7 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days and not fromFlag: + + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + for w in nexts: + if wordPrev == w: + dayOffset += 7 + used += 1 + start -= 1 + for w in lasts: + if wordPrev == w: + dayOffset -= 7 + used += 1 + start -= 1 + for w in suffix_nexts: + if wordNext == w: + dayOffset += 7 + used += 1 + start -= 1 + for w in suffix_lasts: + if wordNext == w: + dayOffset -= 7 + used += 1 + start -= 1 + if wordNext == "feira": + used += 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months[m] + if wordPrev and wordPrev[0].isdigit(): + # 13 maio + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + # maio 13 + datestr += " " + 
wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordPrevPrev and wordPrevPrev[0].isdigit(): + # 13 dia maio + datestr += " " + wordPrevPrev + + start -= 2 + used += 2 + if wordNext and word[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNextNext and wordNextNext[0].isdigit(): + # maio dia 13 + datestr += " " + wordNextNext + used += 2 + if wordNextNextNext and wordNextNextNext[0].isdigit(): + datestr += " " + wordNextNextNext + used += 1 + hasYear = True + else: + hasYear = False + + if datestr in months: + datestr = "" + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("hoje") + validFollowups.append("amanha") + validFollowups.append("ontem") + validFollowups.append("anteontem") + validFollowups.append("agora") + validFollowups.append("ja") + validFollowups.append("ante") + + # TODO debug word "depois" that one is failing for some reason + if word in froms and wordNext in validFollowups: + + if not (wordNext == "amanha" and wordNext == "ontem") and not ( + word == "depois" or word == "antes" or word == "em"): + used = 2 + fromFlag = True + if wordNext == "amanha" and word != "depois": + dayOffset += 1 + elif wordNext == "ontem": + dayOffset -= 1 + elif wordNext == "anteontem": + dayOffset -= 2 + elif wordNext == "ante" and wordNextNext == "ontem": + dayOffset -= 2 + elif (wordNext == "ante" and wordNextNext == "ante" and + wordNextNextNext == "ontem"): + dayOffset -= 3 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if wordNextNext == "feira": + used += 1 + if tmpOffset < 0: + tmpOffset += 7 + if wordNextNext: + if wordNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNext in prevs: + tmpOffset -= 7 + used += 1 + 
dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNextNextNext: + if wordNextNextNext in nxts: + tmpOffset += 7 + used += 1 + elif wordNextNextNext in prevs: + tmpOffset -= 7 + used += 1 + dayOffset += tmpOffset + if wordNextNextNext == "feira": + used += 1 + if wordNext in months: + used -= 1 + if used > 0: + + if start - 1 > 0 and words[start - 1] in lists: + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in lists: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + timeStr = "" + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "meio" and wordNext == "dia": + hrAbs = 12 + used += 2 + elif word == "meia" and wordNext == "noite": + hrAbs = 0 + used += 2 + elif word == "manha": + if not hrAbs: + hrAbs = 8 + used += 1 + elif word == "tarde": + if not hrAbs: + hrAbs = 15 + used += 1 + elif word == "meio" and wordNext == "tarde": + if not hrAbs: + hrAbs = 17 + used += 2 + elif word == "meio" and wordNext == "manha": + if not hrAbs: + hrAbs = 10 + used += 2 + elif word == "fim" and wordNext == "tarde": + if not hrAbs: + hrAbs = 19 + used += 2 + elif word == "fim" and wordNext == "manha": + if not hrAbs: + hrAbs = 11 + used += 2 + elif word == "tantas" and wordNext == "manha": + if not hrAbs: + hrAbs = 4 + used += 2 + elif word == "noite": + if not hrAbs: + hrAbs = 22 + used += 1 + # parse half 
an hour, quarter hour + elif word == "hora" and \ + (wordPrev in time_indicators or wordPrevPrev in + time_indicators): + if wordPrev == "meia": + minOffset = 30 + elif wordPrev == "quarto": + minOffset = 15 + elif wordPrevPrev == "quarto": + minOffset = 15 + if idx > 2 and words[idx - 3] in time_indicators: + words[idx - 3] = "" + words[idx - 2] = "" + else: + hrOffset = 1 + if wordPrevPrev in time_indicators: + words[idx - 2] = "" + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + elif wordNext == "manha": + remainder = "am" + used += 1 + elif wordNext == "tarde": + remainder = "pm" + used += 1 + elif wordNext == "noite": + if 0 < int(word[0]) < 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + elif wordNext in thises and wordNextNext == "manha": + remainder = "am" + used = 2 + elif wordNext in thises and wordNextNext == "tarde": + remainder = "pm" + used = 2 + elif wordNext in thises and wordNextNext == "noite": + remainder = "pm" + used = 2 + else: + if timeQualifier != "": + military = True + if strHH <= 12 and \ + (timeQualifier == "manha" or + timeQualifier == "tarde"): + strHH += 12 + + else: + # try to parse # s without colons + # 5 hours, 10 minutes etc. 
+ length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + else: + if (wordNext == "pm" or + wordNext == "p.m." or + wordNext == "tarde"): + strHH = strNum + remainder = "pm" + used = 1 + elif (wordNext == "am" or + wordNext == "a.m." or + wordNext == "manha"): + strHH = strNum + remainder = "am" + used = 1 + elif (int(word) > 100 and + ( + wordPrev == "o" or + wordPrev == "oh" or + wordPrev == "zero" + )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + military = True + if wordNext == "hora": + used += 1 + elif ( + wordNext == "hora" and + word[0] != '0' and + ( + int(word) < 100 and + int(word) > 2400 + )): + # ignores military time + # "in 3 hours" + hrOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "minuto": + # "in 10 minutes" + minOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "segundo": + # in 5 seconds + secOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(word) > 100: + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + military = True + if wordNext == "hora": + used += 1 + + elif wordNext == "" or ( + wordNext == "em" and wordNextNext == "ponto"): + strHH = word + strMM = 00 + if wordNext == "em" and wordNextNext == "ponto": + used += 2 + if wordNextNextNext == "tarde": + remainder = "pm" + used += 1 + elif wordNextNextNext == "manha": + remainder = "am" + used += 1 + elif wordNextNextNext == 
"noite": + if 0 > int(strHH) > 6: + remainder = "am" + else: + remainder = "pm" + used += 1 + + elif wordNext[0].isdigit(): + strHH = word + strMM = wordNext + military = True + used += 1 + if wordNextNext == "hora": + used += 1 + else: + isTime = False + + strHH = int(strHH) if strHH else 0 + strMM = int(strMM) if strMM else 0 + strHH = strHH + 12 if (remainder == "pm" and + 0 < strHH < 12) else strHH + strHH = strHH - 12 if (remainder == "am" and + 0 < strHH >= 12) else strHH + if strHH > 24 or strMM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = strHH * 1 + minAbs = strMM * 1 + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + words[idx + i] = "" + + if wordPrev == "em" or wordPrev == "ponto": + words[words.index(wordPrev)] = "" + + if idx > 0 and wordPrev in time_indicators: + words[idx - 1] = "" + if idx > 1 and wordPrevPrev in time_indicators: + words[idx - 2] = "" + + idx += used - 1 + found = True + + # check that we found a date + if not date_found: + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow + extractedDate = extractedDate.replace(microsecond=0, + second=0, + minute=0, + hour=0) + if datestr != "": + en_months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + en_monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', + 'aug', + 'sept', 'oct', 'nov', 'dec'] + for idx, en_month in enumerate(en_months): + datestr = datestr.replace(months[idx], en_month) + for idx, en_month in enumerate(en_monthsShort): + datestr = datestr.replace(monthsShort[idx], en_month) + + temp = datetime.strptime(datestr, "%B %d") + if extractedDate.tzinfo: + temp = temp.replace(tzinfo=extractedDate.tzinfo) + + if not hasYear: + temp = temp.replace(year=extractedDate.year) + if extractedDate < temp: + extractedDate = extractedDate.replace(year=int(currentYear), + 
def extract_duration_pt(text):
    """
    Extract a duration from a Portuguese phrase.

    Handles phrases such as:
        "10 Minutos"
        "3 dias 8 horas 10 Minutos e 49 Segundos"
    Spelled-out numbers ("duas horas") are converted to digits before
    matching. The words that make up the duration are consumed and the
    remainder of the (lower-cased, re-tokenized) text is returned.

    Months, years, decades, centuries and millennia are not supported by
    ``timedelta``; they are converted to days using the module constants
    DAYS_IN_1_MONTH and DAYS_IN_1_YEAR.

    Args:
        text (str): string containing a duration
    Returns:
        (timedelta, str):
            A tuple containing the duration and the remaining text
            not consumed in the parsing. The first value will
            be None if no duration is found. The text returned
            will have whitespace stripped from the ends.
            For empty input a bare ``None`` is returned instead of a
            tuple (kept for backward compatibility).
    """
    if not text:
        return None

    text = text.lower()
    # timedelta keyword -> Portuguese plural; the singular stem used in the
    # regex is obtained by dropping the trailing "s".
    time_units = {
        'microseconds': 'microsegundos',
        'milliseconds': 'milisegundos',
        'seconds': 'segundos',
        'minutes': 'minutos',
        'hours': 'horas',
        'days': 'dias',
        'weeks': 'semanas'
    }
    # Portuguese plural -> (base constant, multiplier). The base constant is
    # resolved only when the unit actually occurs in the text.
    non_std_un = {
        "meses": ("month", 1),
        "anos": ("year", 1),
        "decadas": ("year", 10),
        "seculos": ("year", 100),
        "milenios": ("year", 1000)
    }

    # <number><whitespace or hyphen><unit stem>[s]
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}[s]?"

    text = text.replace("mês", "meses").replace("é", "e")
    # HACK - "segundo" (second) would otherwise be replaced with "2"
    text = text.replace("segundo", "_s_")
    text = numbers_to_digits_pt(text)
    text = text.replace("_s_", "segundo")  # undo HACK

    def pop_unit(unit_pt, remaining):
        """Remove every '<n> <unit>' from remaining; return (sum, new text)."""
        values = []

        def repl(match):
            values.append(float(match.group("value")))
            return ''

        unit_pattern = pattern.format(unit=unit_pt[:-1])  # drop plural 's'
        remaining = re.sub(unit_pattern, repl, remaining)
        return sum(values), remaining

    totals = dict.fromkeys(time_units, 0)

    for unit_en, unit_pt in time_units.items():
        amount, text = pop_unit(unit_pt, text)
        totals[unit_en] += amount

    for unit_pt, (base, factor) in non_std_un.items():
        amount, text = pop_unit(unit_pt, text)
        if amount:
            days_per_base = DAYS_IN_1_MONTH if base == "month" \
                else DAYS_IN_1_YEAR
            totals["days"] += factor * days_per_base * amount

    text = text.strip()
    duration = timedelta(**totals) if any(totals.values()) else None

    return (duration, text)


def numbers_to_digits_pt(utterance: str) -> str:
    """
    Replace written numbers in text with their digit equivalents.

    The text is tokenized with ``tokenize`` and every token that is a known
    number word is swapped for its digits; tokens are re-joined with single
    spaces. Matching is exact and case-sensitive, and number words are
    replaced one by one (compound numbers are not summed).

    Args:
        utterance (str): Input string possibly containing written numbers.

    Returns:
        str: Text with written numbers replaced by digits.
    """  # TODO - standardize in ovos-number-parser
    number_replacements = {
        "catorze": "14",
        "cem": "100",
        "cento": "100",
        "cinco": "5",
        "cinquenta": "50",
        "dez": "10",
        "dezanove": "19",
        "dezasseis": "16",
        "dezassete": "17",
        "dezoito": "18",
        "dois": "2",
        "doze": "12",
        "duas": "2",
        "duzentas": "200",
        "duzentos": "200",
        "mil": "1000",
        "milhão": "1000000",
        "nove": "9",
        "novecentas": "900",
        "novecentos": "900",
        "noventa": "90",
        "oitenta": "80",
        "oito": "8",
        "oitocentas": "800",
        "oitocentos": "800",
        "onze": "11",
        "primeiro": "1",
        "quarenta": "40",
        "quatro": "4",
        "quatrocentas": "400",
        "quatrocentos": "400",
        "quinhentas": "500",
        "quinhentos": "500",
        "quinze": "15",
        "segundo": "2",
        "seis": "6",
        "seiscentas": "600",
        "seiscentos": "600",
        "sessenta": "60",
        "sete": "7",
        "setecentas": "700",
        "setecentos": "700",
        "setenta": "70",
        "terceiro": "3",
        "tres": "3",
        "treze": "13",
        "trezentas": "300",
        "trezentos": "300",
        "trinta": "30",
        "três": "3",
        "um": "1",
        "uma": "1",
        "vinte": "20",
        "zero": "0"
    }
    return " ".join(number_replacements.get(word, word)
                    for word in tokenize(utterance))


def tokenize(utterance):
    """
    Split an utterance into whitespace-separated tokens.

    Percent signs, leading '#' and intra-word hyphens are separated into
    their own tokens first. A trailing lone '-' token is dropped.

    Args:
        utterance (str): text to split
    Returns:
        list[str]: tokens; empty list for empty or whitespace-only input
    """
    # Split things like 12%
    utterance = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance)
    # Split things like #1
    utterance = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", utterance)
    # Split things like amo-te
    utterance = re.sub(r"([a-zA-Z]+)(-)([a-zA-Z]+\b)", r"\1 \2 \3",
                       utterance)
    tokens = utterance.split()
    # guard: an empty token list has no last element to inspect
    if tokens and tokens[-1] == '-':
        tokens = tokens[:-1]

    return tokens


def _pt_pruning(text, symbols=True, accents=True, agressive=True):
    """
    Aggressively prune a Portuguese string left over after date parsing.

    Args:
        text (str): text to clean
        symbols (bool): strip punctuation/ordinal marks and turn '-' and
            '_' into spaces
        accents (bool): replace accented vowels and 'ç' by plain letters
        agressive (bool): drop common articles, prepositions and pronouns
            and collapse repeated whitespace
    Returns:
        str: the pruned text
    """
    stop_words = {"a", "o", "os", "as", "de", "dos", "das",
                  "lhe", "lhes", "me", "e", "no", "nas", "na", "nos", "em",
                  "para", "este",
                  "esta", "deste", "desta", "neste", "nesta", "nesse",
                  "nessa", "foi", "que"}
    if symbols:
        for symbol in (".", ",", ";", ":", "!", "?", "º", "ª"):
            text = text.replace(symbol, "")
        text = text.replace("-", " ").replace("_", " ")
    if accents:
        accent_map = {"a": ["á", "à", "ã", "â"],
                      "e": ["ê", "è", "é"],
                      "i": ["í", "ì"],
                      "o": ["ò", "ó"],
                      "u": ["ú", "ù"],
                      "c": ["ç"]}
        for plain, accented_chars in accent_map.items():
            for acc in accented_chars:
                text = text.replace(acc, plain)
    if agressive:
        kept = ["" if word in stop_words else word
                for word in text.split(" ")]
        # split()/join collapses the gaps left by removed words
        text = ' '.join(" ".join(kept).split())
    return text
def _day_period_ru(hour):
    """Return the Russian day-period suffix (with leading space) for hour."""
    if hour < 4:
        return " ночи"
    if hour < 12:
        return " утра"
    if hour < 18:
        return " дня"
    return " вечера"


def nice_time_ru(dt, speech=True, use_24hour=True, use_ampm=False):
    """
    Format a time to a comfortable human format.

    For example, generate a spoken phrase for speech or '5:30' for
    text display.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): append the part of day (ночи/утра/дня/вечера)
            in 12-hour format
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        # e.g. "3:01" or "2:22", optionally followed by the part of day
        string = dt.strftime("%I:%M")
        if use_ampm:
            string += _day_period_ru(dt.hour)
        if string[0] == '0':
            string = string[1:]  # strip leading zeros

    if not speech:
        return string

    # Generate a speakable version of the time
    if use_24hour:
        # hours: a leading zero is spoken explicitly ("ноль восемь")
        if string[0] == '0':
            speak = pronounce_hour_ru(int(string[0])) + " "
            speak += pronounce_number_ru(int(string[1]))
        else:
            speak = pronounce_hour_ru(int(string[0:2]))

        speak += " "
        # minutes: "ровно" on the hour, leading zero spoken explicitly
        if string[3:5] == '00':
            speak += "ровно"
        elif string[3] == '0':
            speak += pronounce_number_ru(0) + " "
            speak += pronounce_number_ru(int(string[4]))
        else:
            speak += pronounce_number_ru(int(string[3:5]))
        return speak

    if dt.hour == 0 and dt.minute == 0:
        return "полночь"
    if dt.hour == 12 and dt.minute == 0:
        return "полдень"

    hour = dt.hour % 12 or 12  # 12 hour clock and 0 is spoken as 12
    if dt.minute == 15:
        speak = pronounce_hour_ru(hour) + " с четвертью"
    elif dt.minute == 30:
        speak = pronounce_hour_ru(hour) + " с половиной"
    elif dt.minute == 45:
        next_hour = (dt.hour + 1) % 12 or 12
        speak = "без четверти " + pronounce_hour_ru(next_hour)
    else:
        speak = pronounce_hour_ru(hour)

        if dt.minute == 0:
            if not use_ampm:
                # "час" already stands for one o'clock on its own
                if dt.hour % 12 == 1:
                    return speak
                return speak + " " + plural_ru(dt.hour % 12,
                                               "час", "часа", "часов")
        else:
            if dt.minute < 10:
                speak += " ноль"
            speak += " " + pronounce_number_ru(dt.minute)

    if use_ampm:
        speak += _day_period_ru(dt.hour)

    return speak


def nice_duration_ru(duration, speech=True):
    """ Convert duration to a nice spoken timespan

    Args:
        duration (int, float or timedelta): time span, as a number of
            seconds or as a timedelta
        speech (bool): only spoken output is implemented
    Returns:
        str: timespan as a string; empty string when the duration is
            shorter than one second
    Raises:
        NotImplementedError: if speech is False
    """

    if not speech:
        raise NotImplementedError

    if isinstance(duration, timedelta):
        duration = duration.total_seconds()

    days = int(duration // 86400)
    hours = int(duration // 3600 % 24)
    minutes = int(duration // 60 % 60)
    seconds = int(duration % 60)

    parts = []

    if days > 0:
        parts.append(pronounce_number_ru(days) + " " +
                     plural_ru(days, "день", "дня", "дней"))
    if hours > 0:
        parts.append(pronounce_number_ru(hours) + " " +
                     plural_ru(hours, "час", "часа", "часов"))
    # минута / секунда are feminine nouns, hence the feminine numerals
    if minutes > 0:
        parts.append(pronounce_number_feminine_ru(minutes) + " " +
                     plural_ru(minutes, "минута", "минуты", "минут"))
    if seconds > 0:
        parts.append(pronounce_number_feminine_ru(seconds) + " " +
                     plural_ru(seconds, "секунда", "секунды", "секунд"))

    return " ".join(parts)


def pronounce_hour_ru(num):
    """Pronounce an hour number; one o'clock is spoken simply as "час"."""
    if num == 1:
        return "час"
    return pronounce_number_ru(num)


def extract_duration_ru(text):
    """
    Extract a duration from a Russian phrase.

    Number words are converted to digits first, then every
    "<number> <unit>" pair whose unit is listed in _TIME_UNITS_CONVERSION
    is summed into the matching timedelta field.

    The words used in the duration will be consumed, and
    the remainder returned.

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str):
            A tuple containing the duration and the remaining text
            not consumed in the parsing. The first value will
            be None if no duration is found. The text returned
            will have whitespace stripped from the ends.
            For empty input a bare ``None`` is returned instead of a
            tuple (kept for backward compatibility).
    """
    if not text:
        return None

    # Russian inflection for time: минута, минуты, минут - safe to use минута as pattern
    # For day: день, дня, дней - short pattern not applicable, list all

    time_units = {
        'microseconds': 0,
        'milliseconds': 0,
        'seconds': 0,
        'minutes': 0,
        'hours': 0,
        'days': 0,
        'weeks': 0
    }

    # <number><whitespace or hyphen><unit>[common case ending]
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}(?:а|ов|у|ут|уту)?"
    text = _convert_words_to_numbers_ru(text)

    for unit_ru, unit_en in _TIME_UNITS_CONVERSION.items():
        unit_pattern = pattern.format(unit=unit_ru)

        def repl(match, unit_en=unit_en):
            time_units[unit_en] += float(match.group("value"))
            return ''

        text = re.sub(unit_pattern, repl, text)

    text = text.strip()
    duration = timedelta(**time_units) if any(time_units.values()) else None

    return duration, text
+ """ + + def clean_string(s): + # clean unneeded punctuation and capitalization among other things. + # Normalize Russian inflection + s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ + .replace("сегодня вечером", "вечером") \ + .replace("сегодня ночью", "ночью") + word_list = s.split() + + for idx, word in enumerate(word_list): + # word = word.replace("'s", "") + ########## + # Russian Day Ordinals - we do not use 1st,2nd format + # instead we use full ordinal number names with specific format(suffix) + # Example: тридцать первого > 31 + count_ordinals = 0 + if word == "первого": + count_ordinals = 1 # These two have different format + elif word == "третьего": + count_ordinals = 3 + elif word.endswith("ого"): + tmp = word[:-3] + tmp += "ый" + for nr, name in _ORDINAL_BASE_RU.items(): + if name == tmp: + count_ordinals = nr + + # If number is bigger than 19 check if next word is also ordinal + # and count them together + if count_ordinals > 19: + if word_list[idx + 1] == "первого": + count_ordinals += 1 # These two have different format + elif word_list[idx + 1] == "третьего": + count_ordinals += 3 + elif word_list[idx + 1].endswith("ого"): + tmp = word_list[idx + 1][:-3] + tmp += "ый" + for nr, name in _ORDINAL_BASE_RU.items(): + if name == tmp and nr < 10: + # write only if sum makes acceptable count of days in month + if (count_ordinals + nr) <= 31: + count_ordinals += nr + + if count_ordinals > 0: + word = str(count_ordinals) # Write normalized value into word + if count_ordinals > 20: + # If counted number is greater than 20, clear next word so it is not used again + word_list[idx + 1] = "" + ########## + # Remove inflection from Russian months + + word_list[idx] = word + + return word_list + + def date_found(): + return found or \ + ( + date_string != "" or + year_offset != 0 or month_offset != 0 or + day_offset is True or hr_offset != 0 or + hr_abs or min_offset != 0 or + min_abs or sec_offset != 0 + ) + + if text == "": + return None + + 
anchor_date = anchor_date or now_local() + found = False + day_specified = False + day_offset = False + month_offset = 0 + year_offset = 0 + today = anchor_date.strftime("%w") + current_year = anchor_date.strftime("%Y") + from_flag = False + date_string = "" + has_year = False + time_qualifier = "" + + time_qualifiers_am = _WORDS_MORNING_RU + time_qualifiers_pm = ['дня', 'вечера'] + time_qualifiers_pm.extend(_WORDS_DAY_RU) + time_qualifiers_pm.extend(_WORDS_EVENING_RU) + time_qualifiers_pm.extend(_WORDS_NIGHT_RU) + time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm) + markers = ['на', 'в', 'во', 'до', 'на', 'это', + 'около', 'этот', 'через', 'спустя', 'за', 'тот'] + days = ['понедельник', 'вторник', 'среда', + 'четверг', 'пятница', 'суббота', 'воскресенье'] + months = _MONTHS_RU + recur_markers = days + ['выходные', 'викенд'] + months_short = ['янв', 'фев', 'мар', 'апр', 'май', 'июн', 'июл', 'авг', + 'сен', 'окт', 'ноя', 'дек'] + year_multiples = ["десятилетие", "век", "тысячелетие"] + + words = clean_string(text) + preposition = "" + + for idx, word in enumerate(words): + if word == "": + continue + + if word in markers: + preposition = word + + word = _text_ru_inflection_normalize(word, 2) + word_prev_prev = _text_ru_inflection_normalize( + words[idx - 2], 2) if idx > 1 else "" + word_prev = _text_ru_inflection_normalize( + words[idx - 1], 2) if idx > 0 else "" + word_next = _text_ru_inflection_normalize( + words[idx + 1], 2) if idx + 1 < len(words) else "" + word_next_next = _text_ru_inflection_normalize( + words[idx + 2], 2) if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + start = idx + used = 0 + if word in _WORDS_NOW_RU and not date_string: + result_str = " ".join(words[idx + 1:]) + result_str = ' '.join(result_str.split()) + extracted_date = anchor_date.replace(microsecond=0) + return [extracted_date, result_str] + elif word_next in year_multiples: + multiplier = None + if 
is_numeric(word): + multiplier = extract_number_ru(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if word_next == "десятилетие": + year_offset = multiplier * 10 + elif word_next == "век": + year_offset = multiplier * 100 + elif word_next == "тысячелетие": + year_offset = multiplier * 1000 + elif word in time_qualifiers_list and preposition != "через" and word_next != "назад": + time_qualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "сегодня" and not from_flag: + day_offset = 0 + used += 1 + elif word == "завтра" and not from_flag: + day_offset = 1 + used += 1 + elif word == "послезавтра" and not from_flag: + day_offset = 2 + used += 1 + elif word == "после" and word_next == "завтра" and not from_flag: + day_offset = 2 + used += 2 + elif word == "позавчера" and not from_flag: + day_offset = -2 + used += 1 + elif word == "вчера" and not from_flag: + day_offset = -1 + used += 1 + elif (word in ["день", "дня"] and + word_next == "после" and + word_next_next == "завтра" and + not from_flag and + (not word_prev or not word_prev[0].isdigit())): + day_offset = 2 + used = 2 + elif word in ["день", "дня"] and is_numeric(word_prev) and preposition == "через": + if word_prev and word_prev[0].isdigit(): + day_offset += int(word_prev) + start -= 1 + used = 2 + elif word in ["день", "дня"] and is_numeric(word_prev) and word_next == "назад": + if word_prev and word_prev[0].isdigit(): + day_offset += -int(word_prev) + start -= 1 + used = 3 + elif word == "сегодня" and not from_flag and word_prev: + if word_prev[0].isdigit(): + day_offset += int(word_prev) * 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_RU: + day_offset = 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_RU: + day_offset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "неделя" and not from_flag and preposition in ["через", "на"]: + if word_prev[0].isdigit(): + day_offset = int(word_prev) * 7 + 
start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_RU: + day_offset = 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_RU: + day_offset = -7 + start -= 1 + used = 2 + elif word == "месяц" and not from_flag and preposition in ["через", "на"]: + if word_prev[0].isdigit(): + month_offset = int(word_prev) + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_RU: + month_offset = 1 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_RU: + month_offset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "год" and not from_flag and preposition in ["через", "на"]: + if word_prev[0].isdigit(): + year_offset = int(word_prev) + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_RU: + year_offset = 1 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_RU: + year_offset = -1 + start -= 1 + used = 2 + elif word_prev == "через": + year_offset = 1 + used = 1 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days and not from_flag: + d = days.index(word) + day_offset = (d + 1) - int(today) + used = 1 + if day_offset < 0: + day_offset += 7 + if word_prev in _WORDS_NEXT_RU: + if day_offset <= 2: + day_offset += 7 + used += 1 + start -= 1 + elif word_prev in _WORDS_PREV_RU: + day_offset -= 7 + used += 1 + start -= 1 + elif word in months or word in months_short and not from_flag: + try: + m = months.index(word) + except ValueError: + m = months_short.index(word) + used += 1 + # Convert Russian months to english + date_string = _MONTHS_CONVERSION.get(m) + if word_prev and (word_prev[0].isdigit() or + (word_prev == " " and word_prev_prev[0].isdigit())): + if word_prev == " " and word_prev_prev[0].isdigit(): + date_string += " " + words[idx - 2] + used += 1 + start -= 1 + else: + date_string += " " + word_prev + start -= 1 + used += 1 + if word_next and word_next[0].isdigit(): + date_string += " " + word_next + used += 1 + has_year = True + else: + has_year = False + + elif word_next and 
word_next[0].isdigit(): + date_string += " " + word_next + used += 1 + if word_next_next and word_next_next[0].isdigit(): + date_string += " " + word_next_next + used += 1 + has_year = True + else: + has_year = False + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + valid_followups = days + months + months_short + valid_followups.append("сегодня") + valid_followups.append("завтра") + valid_followups.append("послезавтра") + valid_followups.append("вчера") + valid_followups.append("позавчера") + for followup in _WORDS_NEXT_RU: + valid_followups.append(followup) + for followup in _WORDS_PREV_RU: + valid_followups.append(followup) + for followup in _WORDS_CURRENT_RU: + valid_followups.append(followup) + for followup in _WORDS_NOW_RU: + valid_followups.append(followup) + if (word in ["до", "по", "от", "с", "со"]) and word_next in valid_followups: + used = 2 + from_flag = True + if word_next == "завтра": + day_offset += 1 + elif word_next == "послезавтра": + day_offset += 2 + elif word_next == "вчера": + day_offset -= 1 + elif word_next == "позавчера": + day_offset -= 2 + elif word_next in days: + d = days.index(word_next) + tmp_offset = (d + 1) - int(today) + used = 2 + if tmp_offset < 0: + tmp_offset += 7 + day_offset += tmp_offset + elif word_next_next and word_next_next in days: + d = days.index(word_next_next) + tmp_offset = (d + 1) - int(today) + used = 3 + if word_next in _WORDS_NEXT_RU: + if day_offset <= 2: + tmp_offset += 7 + used += 1 + start -= 1 + elif word_next in _WORDS_PREV_RU: + tmp_offset -= 7 + used += 1 + start -= 1 + day_offset += tmp_offset + if used > 0: + if start - 1 > 0 and (words[start - 1] in _WORDS_CURRENT_RU): + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + day_specified = True + + # parse time + hr_offset = 0 + min_offset = 0 + sec_offset = 0 + hr_abs = None + min_abs = None + 
military = False + preposition = "" + + for idx, word in enumerate(words): + if word == "": + continue + + if word in markers: + preposition = word + + word = _text_ru_inflection_normalize(word, 2) + word_prev_prev = _text_ru_inflection_normalize( + words[idx - 2], 2) if idx > 1 else "" + word_prev = _text_ru_inflection_normalize( + words[idx - 1], 2) if idx > 0 else "" + word_next = _text_ru_inflection_normalize( + words[idx + 1], 2) if idx + 1 < len(words) else "" + word_next_next = _text_ru_inflection_normalize( + words[idx + 2], 2) if idx + 2 < len(words) else "" + + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "полдень": + hr_abs = 12 + used += 1 + elif word == "полночь": + hr_abs = 0 + used += 1 + elif word in _WORDS_MORNING_RU: + if hr_abs is None: + hr_abs = 8 + used += 1 + elif word in _WORDS_DAY_RU: + if hr_abs is None: + hr_abs = 15 + used += 1 + elif word in _WORDS_EVENING_RU: + if hr_abs is None: + hr_abs = 19 + used += 1 + if word_next != "" and word_next[0].isdigit() and ":" in word_next: + used -= 1 + elif word in _WORDS_NIGHT_RU: + if hr_abs is None: + hr_abs = 22 + # parse half an hour, quarter hour + elif word == "час" and \ + (word_prev in markers or word_prev_prev in markers): + if word_prev in ["пол", "половина"]: + min_offset = 30 + elif word_prev == "четверть": + min_offset = 15 + elif word_prev == "через": + hr_offset = 1 + else: + hr_offset = 1 + if word_prev_prev in markers: + words[idx - 2] = "" + if word_prev_prev in _WORDS_CURRENT_RU: + day_specified = True + words[idx - 1] = "" + used += 1 + hr_abs = -1 + min_abs = -1 + # parse 5:00 am, 12:00 p.m., etc + # parse in a minute + elif word == "минута" and word_prev == "через": + min_offset = 1 + words[idx - 1] = "" + used += 1 + # parse in a second + elif word == "секунда" and word_prev == "через": + sec_offset = 1 + words[idx - 1] = "" + used += 1 + elif word[0].isdigit(): + is_time = True + str_hh = "" + str_mm = "" + remainder = "" + word_next_next_next = 
words[idx + 3] \ + if idx + 3 < len(words) else "" + if word_next in _WORDS_EVENING_RU or word_next in _WORDS_NIGHT_RU or word_next_next in _WORDS_EVENING_RU \ + or word_next_next in _WORDS_NIGHT_RU or word_prev in _WORDS_EVENING_RU \ + or word_prev in _WORDS_NIGHT_RU or word_prev_prev in _WORDS_EVENING_RU \ + or word_prev_prev in _WORDS_NIGHT_RU or word_next_next_next in _WORDS_EVENING_RU \ + or word_next_next_next in _WORDS_NIGHT_RU: + remainder = "pm" + used += 1 + if word_prev in _WORDS_EVENING_RU or word_prev in _WORDS_NIGHT_RU: + words[idx - 1] = "" + if word_prev_prev in _WORDS_EVENING_RU or word_prev_prev in _WORDS_NIGHT_RU: + words[idx - 2] = "" + if word_next_next in _WORDS_EVENING_RU or word_next_next in _WORDS_NIGHT_RU: + used += 1 + if word_next_next_next in _WORDS_EVENING_RU or word_next_next_next in _WORDS_NIGHT_RU: + used += 1 + + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + str_hh += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + str_mm += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + next_word = word_next.replace(".", "") + if next_word in ["am", "pm", "ночи", "утра", "дня", "вечера"]: + remainder = next_word + used += 1 + elif next_word == "часа" and word_next_next in ["am", "pm", "ночи", "утра", "дня", "вечера"]: + remainder = word_next_next + used += 2 + elif word_next in _WORDS_MORNING_RU: + remainder = "am" + used += 2 + elif word_next in _WORDS_DAY_RU: + remainder = "pm" + used += 2 + elif word_next in _WORDS_EVENING_RU: + remainder = "pm" + used += 2 + elif word_next == "этого" and word_next_next in _WORDS_MORNING_RU: + remainder = "am" + used = 2 + day_specified = True + elif word_next == "на" and word_next_next in _WORDS_DAY_RU: + remainder = "pm" + used = 2 + day_specified = 
True + elif word_next == "на" and word_next_next in _WORDS_EVENING_RU: + remainder = "pm" + used = 2 + day_specified = True + elif word_next == "в" and word_next_next in _WORDS_NIGHT_RU: + if str_hh and int(str_hh) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + elif hr_abs and hr_abs != -1: + if hr_abs >= 12: + remainder = "pm" + else: + remainder = "am" + used += 1 + else: + if time_qualifier != "": + military = True + if str_hh and int(str_hh) <= 12 and \ + (time_qualifier in time_qualifiers_pm): + str_hh += str(int(str_hh) + 12) + + else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. + length = len(word) + str_num = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + str_num += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = word_next.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + word_next == "pm" or + remainder == "p.m." or + word_next == "p.m." or + (remainder == "дня" and preposition != 'через') or + (word_next == "дня" and preposition != 'через') or + remainder == "вечера" or + word_next == "вечера"): + str_hh = str_num + remainder = "pm" + used = 1 + if ( + remainder == "pm" or + word_next == "pm" or + remainder == "p.m." or + word_next == "p.m." or + (remainder == "дня" and preposition != 'через') or + (word_next == "дня" and preposition != 'через') or + remainder == "вечера" or + word_next == "вечера"): + str_hh = str_num + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + word_next == "am" or + remainder == "a.m." or + word_next == "a.m." 
or + remainder == "ночи" or + word_next == "ночи" or + remainder == "утра" or + word_next == "утра"): + str_hh = str_num + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + word_next in recur_markers or + word_next_next in recur_markers): + # Ex: "7 on mondays" or "3 this friday" + # Set str_hh so that is_time == True + # when am or pm is not specified + str_hh = str_num + used = 1 + else: + if int(str_num) > 100: + str_hh = str(int(str_num) // 100) + str_mm = str(int(str_num) % 100) + military = True + if word_next == "час": + used += 1 + elif ( + (word_next == "час" or + remainder == "час") and + word[0] != '0' and + # (wordPrev != "в" and wordPrev != "на") + word_prev == "через" + and + ( + int(str_num) < 100 or + int(str_num) > 2400 + )): + # ignores military time + # "in 3 hours" + hr_offset = int(str_num) + used = 2 + is_time = False + hr_abs = -1 + min_abs = -1 + elif word_next == "минута" or \ + remainder == "минута": + # "in 10 minutes" + min_offset = int(str_num) + used = 2 + is_time = False + hr_abs = -1 + min_abs = -1 + elif word_next == "секунда" \ + or remainder == "секунда": + # in 5 seconds + sec_offset = int(str_num) + used = 2 + is_time = False + hr_abs = -1 + min_abs = -1 + elif int(str_num) > 100: + # military time, eg. "3300 hours" + str_hh = str(int(str_num) // 100) + str_mm = str(int(str_num) % 100) + military = True + if word_next == "час" or \ + remainder == "час": + used += 1 + elif word_next and word_next[0].isdigit(): + # military time, e.g. 
"04 38 hours" + str_hh = str_num + str_mm = word_next + military = True + used += 1 + if (word_next_next == "час" or + remainder == "час"): + used += 1 + elif ( + word_next == "" or word_next == "час" or + ( + (word_next == "в" or word_next == "на") and + ( + word_next_next == time_qualifier + ) + ) or word_next in _WORDS_EVENING_RU or + word_next_next in _WORDS_EVENING_RU): + + str_hh = str_num + str_mm = "00" + if word_next == "час": + used += 1 + if (word_next == "в" or word_next == "на" + or word_next_next == "в" or word_next_next == "на"): + used += (1 if (word_next == + "в" or word_next == "на") else 2) + word_next_next_next = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if (word_next_next and + (word_next_next in time_qualifier or + word_next_next_next in time_qualifier)): + if (word_next_next in time_qualifiers_pm or + word_next_next_next in time_qualifiers_pm): + remainder = "pm" + used += 1 + if (word_next_next in time_qualifiers_am or + word_next_next_next in time_qualifiers_am): + remainder = "am" + used += 1 + + if time_qualifier != "": + if time_qualifier in time_qualifiers_pm: + remainder = "pm" + used += 1 + + elif time_qualifier in time_qualifiers_am: + remainder = "am" + used += 1 + else: + # TODO: Unsure if this is 100% accurate + used += 1 + military = True + elif remainder == "час": + if word_next_next in ["ночи", "утра"]: + remainder = "am" + used += 1 + elif word_next_next in ["дня", "вечера"]: + remainder = "pm" + used += 1 + else: + remainder = "" + + else: + is_time = False + hh = int(str_hh) if str_hh else 0 + mm = int(str_mm) if str_mm else 0 + hh = hh + 12 if remainder == "pm" and hh < 12 else hh + hh = hh - 12 if remainder == "am" and hh >= 12 else hh + if (not military and + remainder not in ['am', 'pm', 'час', 'минута', 'секунда'] and + ((not day_specified) or 0 <= day_offset < 1)): + + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + if 
anchor_date.hour < hh or (anchor_date.hour == hh and + anchor_date.minute < mm): + pass # No modification needed + elif anchor_date.hour < hh + 12: + hh += 12 + else: + # has passed, assume the next morning + day_offset += 1 + if time_qualifier in time_qualifiers_pm and hh < 12: + hh += 12 + + if hh > 24 or mm > 59: + is_time = False + used = 0 + if is_time: + hr_abs = hh + min_abs = mm + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + + # if wordPrev == "o" or wordPrev == "oh": + # words[words.index(wordPrev)] = "" + + if word_prev == "скоро": + hr_offset = -1 + words[idx - 1] = "" + idx -= 1 + elif word_prev == "позже": + hr_offset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and word_prev in markers: + words[idx - 1] = "" + if word_prev in _WORDS_CURRENT_RU: + day_specified = True + if idx > 1 and word_prev_prev in markers: + words[idx - 2] = "" + if word_prev_prev in _WORDS_CURRENT_RU: + day_specified = True + + idx += used - 1 + found = True + # check that we found a date + if not date_found(): + return None + + if day_offset is False: + day_offset = 0 + + # perform date manipulation + + extracted_date = anchor_date.replace(microsecond=0) + if date_string != "": + # date included an explicit date, e.g. 
"june 5" or "june 2, 2017" + try: + temp = datetime.strptime(date_string, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(date_string, "%B %d %Y") + extracted_date = extracted_date.replace(hour=0, minute=0, second=0) + if not has_year: + temp = temp.replace(year=extracted_date.year, + tzinfo=extracted_date.tzinfo) + if extracted_date < temp: + extracted_date = extracted_date.replace( + year=int(current_year), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extracted_date.tzinfo) + else: + extracted_date = extracted_date.replace( + year=int(current_year) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extracted_date.tzinfo) + else: + extracted_date = extracted_date.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extracted_date.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hr_offset == 0 and min_offset == 0 and sec_offset == 0: + extracted_date = extracted_date.replace(hour=0, minute=0, second=0) + + if year_offset != 0: + extracted_date = extracted_date + relativedelta(years=year_offset) + if month_offset != 0: + extracted_date = extracted_date + relativedelta(months=month_offset) + if day_offset != 0: + extracted_date = extracted_date + relativedelta(days=day_offset) + if hr_abs != -1 and min_abs != -1: + # If no time was supplied in the string set the time to default + # time if it's available + if hr_abs is None and min_abs is None and default_time is not None: + hr_abs, min_abs = default_time.hour, default_time.minute + else: + hr_abs = hr_abs or 0 + min_abs = min_abs or 0 + + extracted_date = extracted_date + relativedelta(hours=hr_abs, + minutes=min_abs) + if (hr_abs != 0 or min_abs != 0) and date_string == "": + if not day_specified and anchor_date > extracted_date: + extracted_date = extracted_date + relativedelta(days=1) + if hr_offset != 0: + 
def _text_ru_inflection_normalize(word, arg):
    """
    Map a known inflected Russian word form onto its base form.

    The helper is shared by several parsers; ``arg`` selects the rule set
    that belongs to the caller.

    Args:
        word (str): the token to normalize
        arg (int): rule set selector - 1 for
            _extract_whole_number_with_text_ru, 2 for extract_datetime_ru

    Returns:
        (str): the base form when one of the rules matches, otherwise
            ``word`` unchanged
    """
    # "thousand" is normalized for every caller
    if word in ("тысяч", "тысячи"):
        return "тысяча"

    # Ordered (inflected forms, base form) rules; the first rule that
    # contains the word wins.
    if arg == 1:  # _extract_whole_number_with_text_ru
        rules = (
            (("одна", "одним", "одно", "одной"), "один"),
            (("две",), "два"),
            (("пару",), "пара"),
        )
    elif arg == 2:  # extract_datetime_ru
        rules = (
            (("часа", "часам", "часами", "часов", "часу"), "час"),
            (("минут", "минутам", "минутами", "минуту", "минуты"), "минута"),
            (("секунд", "секундам", "секундами", "секунду", "секунды"),
             "секунда"),
            (("дней", "дни"), "день"),
            (("неделе", "недели", "недель"), "неделя"),
            (("месяца", "месяцев"), "месяц"),
            (("года", "лет"), "год"),
            (_WORDS_MORNING_RU, "утром"),
            (("полудне", "полудня"), "полдень"),
            (_WORDS_EVENING_RU, "вечером"),
            (_WORDS_NIGHT_RU, "ночь"),
            (("викенд", "выходным", "выходных"), "выходные"),
            (("столетие", "столетий", "столетия"), "век"),
            # week days
            (("среду", "среды"), "среда"),
            (("пятницу", "пятницы"), "пятница"),
            (("субботу", "субботы"), "суббота"),
            # months whose base form does not end in a soft sign
            (("марта", "марте"), "март"),
            (("мае", "мая"), "май"),
            (("августа", "августе"), "август"),
        )
    else:
        rules = ()

    for forms, base in rules:
        if word in forms:
            return base

    # Remaining months: swap the final vowel for a soft sign and accept the
    # result only when it is one of the names listed in _MONTHS_RU.
    if arg == 2 and word[-2:] in ("ле", "ля", "не", "ня", "ре", "ря"):
        candidate = word[:-1] + "ь"
        if candidate in _MONTHS_RU:
            return candidate

    return word
def pronounce_number_feminine_ru(num):
    """
    Pronounce a number in Russian with the feminine forms of 1 and 2.

    The text returned by pronounce_number_ru is reused and only its ending
    is rewritten: for numbers ending in 1 (but not 11) the last two
    characters are replaced with "на", for numbers ending in 2 (but not 12)
    the last character is replaced with "е".
    NOTE(review): this presumes pronounce_number_ru ends such numbers with
    "один" / "два" - confirm against that function.

    Args:
        num (int): number to pronounce; the sign does not affect which
            ending is chosen

    Returns:
        (str): the pronounced number
    """
    pronounced = pronounce_number_ru(num)

    # Classify by magnitude: Python's % yields a non-negative result
    # (-1 % 100 == 99), which would hide the trailing 1 or 2 of a
    # negative number.
    tens, units = divmod(abs(num) % 100, 10)
    if tens != 1:
        if units == 1:
            return pronounced[:-2] + "на"
        if units == 2:
            return pronounced[:-1] + "е"

    return pronounced


def plural_ru(num: int, one: str, few: str, many: str) -> str:
    """
    Pick the Russian plural form that agrees with a count.

    Args:
        num (int): the count; its sign is ignored
        one (str): form for counts ending in 1, except 11 (1, 21, 101)
        few (str): form for counts ending in 2-4, except 12-14 (3, 22, 104)
        many (str): form for every other count (0, 5, 11-19, 100)

    Returns:
        (str): one of ``one``, ``few`` or ``many``
    """
    # abs() first: -1 % 100 == 99 in Python, which would otherwise send
    # every negative count to the "many" form.
    tens, units = divmod(abs(num) % 100, 10)
    if tens == 1:  # 10-19 always take the "many" form
        return many
    if units == 1:
        return one
    if 2 <= units <= 4:
        return few
    return many
"3:01 AM" or "2:22 PM" + string = dt.strftime("%I:%M %p") + else: + # e.g. "3:01" or "2:22" + string = dt.strftime("%I:%M") + + if not speech: + return string + + # Generate a speakable version of the time + speak = "" + if use_24hour: + if dt.hour == 1: + speak += "ett" # 01:00 is "ett" not "en" + else: + speak += pronounce_number_sv(dt.hour) + if not dt.minute == 0: + if dt.minute < 10: + speak += ' noll' + + if dt.minute == 1: + speak += ' ett' + else: + speak += " " + pronounce_number_sv(dt.minute) + + return speak # ampm is ignored when use_24hour is true + else: + hour = dt.hour + + if not dt.minute == 0: + if dt.minute < 30: + if dt.minute != 15: + speak += pronounce_number_sv(dt.minute) + else: + speak += 'kvart' + + if dt.minute == 1: + speak += ' minut över ' + elif dt.minute != 10 and dt.minute != 5 and dt.minute != 15: + speak += ' minuter över ' + else: + speak += ' över ' + elif dt.minute > 30: + if dt.minute != 45: + speak += pronounce_number_sv((60 - dt.minute)) + else: + speak += 'kvart' + + if dt.minute == 1: + speak += ' minut i ' + elif dt.minute != 50 and dt.minute != 55 and dt.minute != 45: + speak += ' minuter i ' + else: + speak += ' i ' + + hour = (hour + 1) % 12 + elif dt.minute == 30: + speak += 'halv ' + hour = (hour + 1) % 12 + + if hour == 0 and dt.minute == 0: + return "midnatt" + if hour == 12 and dt.minute == 0: + return "middag" + # TODO: "half past 3", "a quarter of 4" and other idiomatic times + + if hour == 0: + speak += pronounce_number_sv(12) + elif hour <= 13: + if hour == 1 or hour == 13: # 01:00 and 13:00 is "ett" + speak += 'ett' + else: + speak += pronounce_number_sv(hour) + else: + speak += pronounce_number_sv(hour - 12) + + if use_ampm: + if dt.hour > 11: + if dt.hour < 18: + # 12:01 - 17:59 nachmittags/afternoon + speak += " på eftermiddagen" + elif dt.hour < 22: + # 18:00 - 21:59 abends/evening + speak += " på kvällen" + else: + # 22:00 - 23:59 nachts/at night + speak += " på natten" + elif dt.hour < 3: + # 00:01 - 
02:59 nachts/at night + speak += " på natten" + else: + # 03:00 - 11:59 morgens/in the morning + speak += " på morgonen" + + return speak + + +def extract_datetime_sv(text, anchorDate=None, default_time=None): + def clean_string(s): + """ + cleans the input string of unneeded punctuation and capitalization + among other things. + """ + s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ + .replace(' den ', ' ').replace(' en ', ' ') + wordList = s.split() + for idx, word in enumerate(wordList): + word = word.replace("'s", "") + + ordinals = ["rd", "st", "nd", "th"] + if word[0].isdigit(): + for ordinal in ordinals: + if ordinal in word: + word = word.replace(ordinal, "") + wordList[idx] = word + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or timeStr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if text == "": + return None + + anchorDate = anchorDate or now_local() + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + dateNow = anchorDate + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersList = ['morgon', 'förmiddag', 'eftermiddag', 'kväll'] + markers = ['på', 'i', 'den här', 'kring', 'efter'] + days = ['måndag', 'tisdag', 'onsdag', 'torsdag', + 'fredag', 'lördag', 'söndag'] + months = ['januari', 'februari', 'mars', 'april', 'maj', 'juni', + 'juli', 'augusti', 'september', 'oktober', 'november', + 'december'] + monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', + 'sept', 'oct', 'nov', 'dec'] + + words = clean_string(text) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) 
else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + word = word.rstrip('s') + start = idx + used = 0 + # save timequalifier for later + if word in timeQualifiersList: + timeQualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "idag" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "imorgon" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "morgondagen" or word == "morgondagens" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "övermorgon" and not fromFlag: + dayOffset = 2 + used += 1 + # parse 5 days, 10 weeks, last week, next week + elif word == "dag" or word == "dagar": + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + elif word == "vecka" or word == "veckor" and not fromFlag: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + elif wordPrev == "nästa": + dayOffset = 7 + start -= 1 + used = 2 + elif wordPrev == "förra": + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "månad" and not fromFlag: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "nästa": + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "förra": + monthOffset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "år" and not fromFlag: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "nästa": + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "förra": + yearOffset = -1 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. 
+ elif word in days and not fromFlag: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "nästa": + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "förra": + dayOffset -= 7 + used += 1 + start -= 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort and not fromFlag: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = months[m] + if wordPrev and (wordPrev[0].isdigit() or + (wordPrev == "of" and wordPrevPrev[0].isdigit())): + if wordPrev == "of" and wordPrevPrev[0].isdigit(): + datestr += " " + words[idx - 2] + used += 1 + start -= 1 + else: + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + months + monthsShort + validFollowups.append("idag") + validFollowups.append("imorgon") + validFollowups.append("nästa") + validFollowups.append("förra") + validFollowups.append("nu") + if (word == "från" or word == "efter") and wordNext in validFollowups: + used = 2 + fromFlag = True + if wordNext == "imorgon": + dayOffset += 1 + elif wordNext in days: + d = days.index(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in days: + d = days.index(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNext == "nästa": + tmpOffset += 7 + used += 1 + start -= 1 + elif wordNext == "förra": + tmpOffset -= 7 + used += 1 + start -= 1 + 
dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and words[start - 1] == "denna": + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + timeStr = "" + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "middag": + hrAbs = 12 + used += 1 + elif word == "midnatt": + hrAbs = 0 + used += 1 + elif word == "morgon": + if not hrAbs: + hrAbs = 8 + used += 1 + elif word == "förmiddag": + if not hrAbs: + hrAbs = 10 + used += 1 + elif word == "eftermiddag": + if not hrAbs: + hrAbs = 15 + used += 1 + elif word == "kväll": + if not hrAbs: + hrAbs = 19 + used += 1 + # parse half an hour, quarter hour + elif wordPrev in markers or wordPrevPrev in markers: + if word == "halvtimme" or word == "halvtimma": + minOffset = 30 + elif word == "kvart": + minOffset = 15 + elif word == "timme" or word == "timma": + hrOffset = 1 + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = 
wordNext.replace(".", "") + if nextWord == "am" or nextWord == "pm": + remainder = nextWord + used += 1 + elif nextWord == "tonight": + remainder = "pm" + used += 1 + elif wordNext == "in" and wordNextNext == "the" and \ + words[idx + 3] == "morning": + remainder = "am" + used += 3 + elif wordNext == "in" and wordNextNext == "the" and \ + words[idx + 3] == "afternoon": + remainder = "pm" + used += 3 + elif wordNext == "in" and wordNextNext == "the" and \ + words[idx + 3] == "evening": + remainder = "pm" + used += 3 + elif wordNext == "in" and wordNextNext == "morning": + remainder = "am" + used += 2 + elif wordNext == "in" and wordNextNext == "afternoon": + remainder = "pm" + used += 2 + elif wordNext == "in" and wordNextNext == "evening": + remainder = "pm" + used += 2 + elif wordNext == "this" and wordNextNext == "morning": + remainder = "am" + used = 2 + elif wordNext == "this" and wordNextNext == "afternoon": + remainder = "pm" + used = 2 + elif wordNext == "this" and wordNextNext == "evening": + remainder = "pm" + used = 2 + elif wordNext == "at" and wordNextNext == "night": + if strHH > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + else: + if timeQualifier != "": + if strHH <= 12 and \ + (timeQualifier == "evening" or + timeQualifier == "afternoon"): + strHH += 12 + else: + # try to parse # s without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." 
or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + else: + if wordNext == "pm" or wordNext == "p.m.": + strHH = strNum + remainder = "pm" + used = 1 + elif wordNext == "am" or wordNext == "a.m.": + strHH = strNum + remainder = "am" + used = 1 + elif ( + int(word) > 100 and + ( + wordPrev == "o" or + wordPrev == "oh" + )): + # 0800 hours (pronounced oh-eight-hundred) + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + if wordNext == "hours": + used += 1 + elif ( + wordNext == "hours" and + word[0] != '0' and + ( + int(word) < 100 and + int(word) > 2400 + )): + # "in 3 hours" + hrOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + + elif wordNext == "minutes": + # "in 10 minutes" + minOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext == "seconds": + # in 5 seconds + secOffset = int(word) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(word) > 100: + strHH = int(word) / 100 + strMM = int(word) - strHH * 100 + if wordNext == "hours": + used += 1 + elif wordNext[0].isdigit(): + strHH = word + strMM = wordNext + used += 1 + if wordNextNext == "hours": + used += 1 + elif ( + wordNext == "" or wordNext == "o'clock" or + ( + wordNext == "in" and + ( + wordNextNext == "the" or + wordNextNext == timeQualifier + ) + )): + strHH = word + strMM = 00 + if wordNext == "o'clock": + used += 1 + if wordNext == "in" or wordNextNext == "in": + used += (1 if wordNext == "in" else 2) + if (wordNextNext and + wordNextNext in timeQualifier or + (words[words.index(wordNextNext) + 1] and + words[words.index(wordNextNext) + 1] in + timeQualifier)): + if (wordNextNext == "afternoon" or + (len(words) > + words.index(wordNextNext) + 1 and + words[words.index( + wordNextNext) + 1] == "afternoon")): + remainder = "pm" + if (wordNextNext == "evening" or + (len(words) > + (words.index(wordNextNext) + 1) and + words[words.index( + wordNextNext) + 1] == "evening")): + remainder = "pm" + 
if (wordNextNext == "morning" or + (len(words) > + words.index(wordNextNext) + 1 and + words[words.index( + wordNextNext) + 1] == "morning")): + remainder = "am" + else: + isTime = False + + strHH = int(strHH) if strHH else 0 + strMM = int(strMM) if strMM else 0 + strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH + strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH + if strHH > 24 or strMM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = strHH * 1 + minAbs = strMM * 1 + used += 1 + if used > 0: + # removed parsed words from the sentence + for i in range(used): + words[idx + i] = "" + + if wordPrev == "o" or wordPrev == "oh": + words[words.index(wordPrev)] = "" + + if wordPrev == "early": + hrOffset = -1 + words[idx - 1] = "" + idx -= 1 + elif wordPrev == "late": + hrOffset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and wordPrev in markers: + words[idx - 1] = "" + if idx > 1 and wordPrevPrev in markers: + words[idx - 2] = "" + + idx += used - 1 + found = True + + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow + extractedDate = extractedDate.replace(microsecond=0, + second=0, + minute=0, + hour=0) + if datestr != "": + temp = datetime.strptime(datestr, "%B %d") + if not hasYear: + temp = temp.replace(year=extractedDate.year) + if extractedDate < temp: + extractedDate = extractedDate.replace(year=int(currentYear), + month=int( + temp.strftime( + "%m")), + day=int(temp.strftime( + "%d"))) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d"))) + + if timeStr != "": + temp = datetime(timeStr) + extractedDate = extractedDate.replace(hour=temp.strftime("%H"), + 
def extract_duration_sv(text):
    """
    Extract a duration from a Swedish phrase.

    The text is tokenized and turned into a sequence of
    ``(number, tokens)`` pairs by ``_find_numbers_in_text`` and
    ``_combine_adjacent_numbers``. While walking that sequence, a unit word
    (see ``state_words`` below) selects the active unit, and every number
    met while a unit is active is added to that unit. Any other word resets
    the active unit, except the binding word "och", which keeps it.

    Only seconds, minutes, hours and days are handled.

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str) or None:
            A tuple containing the duration and the remaining text made of
            the tokens that were not consumed by the parsing. ``None`` (not
            a tuple) is returned when no number could be bound to a unit.
    """
    tokens = tokenize(text)
    number_tok_map = _find_numbers_in_text(tokens)
    # Combine adjacent numbers
    simplified = _combine_adjacent_numbers(number_tok_map)

    # Accumulated amount per unit; keys are timedelta keyword arguments.
    states = {
        'days': 0,
        'hours': 0,
        'minutes': 0,
        'seconds': 0
    }

    # Parser state, mapping words that should set the parser to collect
    # numbers to a specific time "size"
    state_words = {
        'days': ('dygn', 'dag', 'dagar', 'dags'),
        'hours': ('timmar', 'timme', 'timma', 'timmes', 'timmas'),
        'minutes': ('minuter', 'minuters', 'minut', 'minuts'),
        'seconds': ('sekunder', 'sekunders', 'sekund', 'sekunds')
    }
    # Must be a real tuple: with a bare string the membership test below
    # degrades to a substring check ("o", "h", "ch", ... would all match).
    binding_words = ('och',)

    consumed = []
    state = None
    valid = False

    for num, toks in simplified:
        if state and num:
            states[state] += num
            consumed.extend(toks)
            valid = True  # If a state field got set this is valid duration
        elif num is None:
            word = toks[0].word
            unit = next(
                (name for name, forms in state_words.items()
                 if word in forms),
                None)
            if unit is not None:
                # Unit word: start collecting numbers for this unit.
                state = unit
                consumed.extend(toks)
            elif word not in binding_words:
                # Unrelated word: stop collecting numbers.
                state = None

    td = timedelta(**states)
    remainder = ' '.join([t.word for t in tokens if t not in consumed])
    return (td, remainder) if valid else None
# Zero-based month index -> lower-case English month name.
_MONTHS_CONVERSION = dict(enumerate((
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
)))

# Ukrainian month names, in calendar order.
_MONTHS_UK = [
    "січень", "лютий", "березень", "квітень", "травень", "червень",
    "липень", "серпень", "вересень", "жовтень", "листопад", "грудень",
]

# Time
# Ukrainian unit word form -> timedelta keyword. Built from per-unit groups;
# the resulting key order is the order in which the forms are listed here.
_TIME_UNITS_CONVERSION = {
    form: unit
    for unit, forms in (
        ("microseconds", ("мікросекунд",)),
        ("milliseconds", ("мілісекунд",)),
        ("seconds", ("секунда", "секунди", "секунд", "секунду")),
        ("minutes", ("хвилина", "хвилини", "хвилин", "хвилину")),
        ("hours", ("година", "годин", "години", "годину",
                   "годинами", "годиною")),
        ("days", ("днів", "день", "дні", "дня")),
        ("weeks", ("тиждень", "тижня", "тижні", "тижнів")),
    )
    for form in forms
}

# Words marking the following/upcoming period.
_WORDS_NEXT_UK = (
    ["майбутня", "майбутнє", "майбутній", "майбутньому", "майбутнім",
     "майбутньої", "майбутнього"]
    + ["нова", "нове", "новий", "нового", "нової", "новим", "новою",
       "через"]
    + ["наступна", "наступне", "наступний", "наступній", "наступному",
       "наступним", "наступною"]
)

# Words marking the previous/past period.
_WORDS_PREV_UK = (
    ["попередня", "попередній", "попереднім", "попередньої",
     "попередню", "попереднього", "попередне", "тому"]
    + ["минула", "минулий", "минуле", "минулу", "минулого", "минулій",
       "минулому", "минулої", "минулою", "минулим"]
    + ["та", "той", "ті", "те", "того"]
)

# Words marking the current period (duplicates are kept as in the source).
_WORDS_CURRENT_UK = (
    ["теперішній", "теперішня", "теперішні", "теперішній",
     "теперішньому", "теперішньою", "теперішнім", "теперішнього",
     "теперішньої"]
    + ["дана", "даний", "дане", "даним", "даною", "даного", "даної",
       "даному", "даній"]
    + ["поточний", "поточна", "поточні", "поточне", "поточного",
       "поточної", "поточному", "поточній", "поточним", "поточною"]
    + ["нинішній", "нинішня", "нинішнє", "нинішньому", "нинішній",
       "нинішнього", "нинішньої", "нинішнім", "нинішньою"]
    + ["цей", "ця", "це", "цим", "цією", "цьому", "цій"]
)

_WORDS_NOW_UK = ["тепер", "зараз"]

# Part-of-day vocabularies.
_WORDS_MORNING_UK = ["ранок", "зранку", "вранці", "ранку"]
_WORDS_DAY_UK = ["вдень", "опівдні"]
_WORDS_EVENING_UK = ["вечер", "ввечері", "увечері", "вечором"]
_WORDS_NIGHT_UK = ["ніч", "вночі"]

# Inflected / collective numeral form -> numeric value, grouped by value.
_PLURALS = {
    form: number
    for number, forms in (
        (2, ('двох', 'двум', 'двома', 'дві', "двоє", "двійка",
             'обидва', 'обидвох', 'обидві', 'обох', 'обома', 'обом',
             'пара', 'пари', 'парою', 'парами', 'парі', 'парах', 'пару')),
        (3, ('трьох', 'трьома', 'трьом')),
        (4, ('чотирьох', 'чотирьом', 'чотирма')),
        (5, ("п'ятьох", "п'ятьом", "п'ятьома")),
        (6, ("шістьом", "шести", "шістьох", "шістьма", "шістьома")),
        (7, ("семи", "сімом", "сімох", "сімома", "сьома")),
        (8, ("восьми", "вісьмох", "вісьмом", "вісьма", "вісьмома")),
        (9, ("дев'яти", "дев'ятьох", "дев'ятьом", "дев'ятьма")),
        (10, ("десяти", "десятьох", "десятьма", "десятьома")),
        (40, ("сорока",)),
        (100, ("сот", "сотень", "сотні", "сотня")),
        (200, ("двохсот", "двомстам", "двомастами", "двохстах")),
        (1000, ("тисяч", "тисячі", "тисячу", "тисячах",
                "тисячами", "тисячею")),
    )
    for form in forms
}
def generate_plurals_uk(originals):
    """
    Return a new set or dict with inflected variants of the original values.

    Every original string is combined with each of a fixed list of
    Ukrainian case/number suffixes (plain concatenation, no stem changes).
    A fixed set of forms of "тисяча" (1000) is always added; for dict input
    forms of "сотня" (100) are added as well.

    Args:
        originals set(str) or dict(str, any): values to inflect

    Returns:
        set(str) or dict(str, any): a dict when ``originals`` is a dict
        (each generated key keeps the value of the key it came from),
        otherwise a set. The originals themselves are not included.

    """
    suffixes = ["а", "ах", "их", "ам", "ами", "ів",
                "ям", "ох", "и", "на", "ни", "і", "ні",
                "ий", "ний", 'ьох', 'ьома', 'ьом', 'ох',
                'ум', 'ма', 'ом']
    if isinstance(originals, dict):
        thousand = {"тисяч": 1000, "тисячі": 1000, "тисячу": 1000,
                    "тисячах": 1000}
        hundred = {"сотня": 100, "сотні": 100, "сотень": 100}
        result_dict = {key + suffix: value
                       for key, value in originals.items()
                       for suffix in suffixes}
        result_dict.update(thousand)
        result_dict.update(hundred)
        return result_dict

    thousand = ["тисяч", "тисячі", "тисячу", "тисячах"]
    result_set = {value + suffix for value in originals for suffix in suffixes}
    result_set.update(thousand)
    # Return the set that actually received the "тисяч..." forms; the
    # previous code rebuilt the comprehension here and silently lost them.
    return result_set
def extract_duration_uk(text):
    """
    Extract a duration from a Ukrainian phrase.

    Number words are first converted to digits with
    ``_convert_words_to_numbers_uk``. Then, for every unit word form in
    ``_TIME_UNITS_CONVERSION``, each "<number> <unit>" occurrence is removed
    from the text and its number is added to the matching ``timedelta``
    field (microseconds up to weeks).

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str) or None:
            A tuple containing the duration and the remaining text not
            consumed in the parsing. The first value is None if no duration
            is found. Tokens of the remaining text that consist only of
            digits are dropped, and the text is stripped at both ends.
            For empty input, a bare ``None`` is returned instead of a tuple.
    """
    if not text:
        return None

    # Ukrainian inflection for time: хвилина, хвилини, хвилин - safe to use хвилина as pattern
    # For day: день, дня, днів - short pattern not applicable, list all

    time_units = {
        'microseconds': 0,
        'milliseconds': 0,
        'seconds': 0,
        'minutes': 0,
        'hours': 0,
        'days': 0,
        'weeks': 0
    }

    # "<number><space or dash><unit><optional inflection suffix>".
    # The group must be spelled (?P<value>...): a bare "(?P" is rejected by
    # the re module as an unknown extension.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}(?:ів|я|и|ин|і|унд|ни|ну|ку|дні|у|днів)?"
    text = _convert_words_to_numbers_uk(text)

    for unit_uk, unit_en in _TIME_UNITS_CONVERSION.items():
        unit_pattern = pattern.format(unit=unit_uk)

        # Bind the current unit as a default so the callback does not rely
        # on the loop variable.
        def repl(match, unit_en=unit_en):
            time_units[unit_en] += float(match.group("value"))
            return ''

        text = re.sub(unit_pattern, repl, text)

    # Drop numbers left over without a unit.
    kept_tokens = [token for token in text.split(' ') if not token.isdigit()]
    text = " ".join(kept_tokens).strip()
    duration = timedelta(**time_units) if any(time_units.values()) else None

    return duration, text
+ + Args: + text (str): string containing date words + anchor_date (datetime): A reference date/time for "tommorrow", etc + default_time (time): Time to set if no time was found in the string + + Returns: + [datetime, str]: An array containing the datetime and the remaining + text not consumed in the parsing, or None if no + date or time related text was found. + """ + + def clean_string(s): + # clean unneeded punctuation and capitalization among other things. + # Normalize Ukrainian inflection + s = s.lower().replace('?', '').replace('.', '').replace(',', '') + s = s.replace("сьогодні вечером|сьогодні ввечері|вечором", "ввечері") + s = s.replace("сьогодні вночі", "вночі") + word_list = s.split() + + for idx, word in enumerate(word_list): + ########## + # Ukrainian Day Ordinals - we do not use 1st,2nd format + # instead we use full ordinal number names with specific format(suffix) + # Example: двадцять третього - 23 + count_ordinals = 0 + if word == "третього": + count_ordinals = 3 + # Example: тридцять першого - 31 + elif word.endswith("ого"): + tmp = word[:-3] + tmp += "ий" + for nr, name in _ORDINAL_BASE_UK.items(): + if name == tmp: + count_ordinals = nr + # Example: тридцять перше > 31 + elif word.endswith("є") or word.endswith("е"): + tmp = word[:-1] + tmp += "ий" + for nr, name in _ORDINAL_BASE_UK.items(): + if name == tmp: + count_ordinals = nr + # If number is bigger than 19 check if next word is also ordinal + # and count them together + if count_ordinals > 19: + if word_list[idx + 1] == "третього": + count_ordinals += 3 + elif word_list[idx + 1].endswith("ого"): + tmp = word_list[idx + 1][:-3] + tmp += "ий" + for nr, name in _ORDINAL_BASE_UK.items(): + if name == tmp and nr < 10: + # write only if sum makes acceptable count of days in month + if (count_ordinals + nr) <= 31: + count_ordinals += nr + + if count_ordinals > 0: + word = str(count_ordinals) # Write normalized value into word + if count_ordinals > 20: + # If counted number is greater than 20, 
clear next word so it is not used again + word_list[idx + 1] = "" + ########## + # Remove inflection from Ukrainian months + word_list[idx] = word + return word_list + + def date_found(): + return found or \ + ( + date_string != "" or + year_offset != 0 or month_offset != 0 or + day_offset is True or hr_offset != 0 or + hr_abs or min_offset != 0 or + min_abs or sec_offset != 0 + ) + + if text == "": + return None + + anchor_date = anchor_date or now_local() + found = False + day_specified = False + day_offset = False + month_offset = 0 + year_offset = 0 + today = anchor_date.strftime("%w") + current_year = anchor_date.strftime("%Y") + from_flag = False + date_string = "" + has_year = False + time_qualifier = "" + + time_qualifiers_am = _WORDS_MORNING_UK + time_qualifiers_pm = ['дня', 'вечора'] + time_qualifiers_pm.extend(_WORDS_DAY_UK) + time_qualifiers_pm.extend(_WORDS_EVENING_UK) + time_qualifiers_pm.extend(_WORDS_NIGHT_UK) + time_qualifiers_list = set(time_qualifiers_am + time_qualifiers_pm) + markers = ['на', 'у', 'в', 'о', 'до', 'це', + 'біля', 'цей', 'через', 'після', 'за', 'той'] + days = ["понеділок", "вівторок", "середа", + "четвер", "п'ятниця", "субота", "неділя"] + months = _MONTHS_UK + recur_markers = days + ['вихідні', 'вікенд'] + months_short = ["січ", "лют", "бер", "квіт", "трав", "червень", "лип", "серп", + "верес", "жовт", "листоп", "груд"] + year_multiples = ["десятиліття", "століття", "тисячоліття", "тисячоліть", "століть", + "сторіччя", "сторіч"] + + words = clean_string(text) + preposition = "" + + for idx, word in enumerate(words): + if word == "": + continue + + if word in markers: + preposition = word + + word = _text_uk_inflection_normalize(word, 2) + word_prev_prev = _text_uk_inflection_normalize( + words[idx - 2], 2) if idx > 1 else "" + word_prev = _text_uk_inflection_normalize( + words[idx - 1], 2) if idx > 0 else "" + word_next = _text_uk_inflection_normalize( + words[idx + 1], 2) if idx + 1 < len(words) else "" + word_next_next = 
_text_uk_inflection_normalize( + words[idx + 2], 2) if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + start = idx + used = 0 + if word in _WORDS_NOW_UK and not date_string: + result_str = " ".join(words[idx + 1:]) + result_str = ' '.join(result_str.split()) + extracted_date = anchor_date.replace(microsecond=0) + return [extracted_date, result_str] + elif word_next in year_multiples: + multiplier = None + if is_numeric(word): + multiplier = extract_number_uk(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if word_next == "десятиліття" or word_next == "декада": + year_offset = multiplier * 10 + elif word_next == "століття" or word_next == "сторіччя": + year_offset = multiplier * 100 + elif word_next in ["тисячоліття", "тисячоліть"]: + year_offset = multiplier * 1000 + elif word_next in ["тисяча", "тисячі", "тисяч"]: + year_offset = multiplier * 1000 + elif word in time_qualifiers_list and preposition != "через" and word_next != "тому": + time_qualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "сьогодні" and not from_flag: + day_offset = 0 + used += 1 + elif word == "завтра" and not from_flag: + day_offset = 1 + used += 1 + elif word == "післязавтра" and not from_flag: + day_offset = 2 + used += 1 + elif word == "після" and word_next == "завтра" and not from_flag: + day_offset = 2 + used += 2 + elif word == "позавчора" and not from_flag: + day_offset = -2 + used += 1 + elif word == "вчора" and not from_flag: + day_offset = -1 + used += 1 + elif (word in ["день", "дня", "дні", "днів"] and + word_next == "після" and + word_next_next == "завтра" and + not from_flag and + (not word_prev or not word_prev[0].isdigit())): + day_offset = 2 + used = 2 + elif word in ["день", "дня", "дні", "днів"] and is_numeric(word_prev) and preposition == "через": + if word_prev and word_prev[0].isdigit(): + day_offset += int(word_prev) + start -= 1 + used = 2 + elif word 
in ["день", "дня", "дні", "днів"] and is_numeric(word_prev) and word_next == "тому": + if word_prev and word_prev[0].isdigit(): + day_offset += -int(word_prev) + start -= 1 + used = 3 + elif word in ["день", "дня", "дні", "днів"] and is_numeric(word_prev) and word_prev_prev == "на": + if word_prev and word_prev[0].isdigit(): + day_offset += int(word_prev) + start -= 1 + used = 2 + elif word == "сьогодні" and not from_flag and word_prev: + if word_prev[0].isdigit(): + day_offset += int(word_prev) * 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_UK: + day_offset = 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_UK: + day_offset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "тиждень" and not from_flag and preposition in ["через", "на"]: + if word_prev[0].isdigit(): + day_offset = int(word_prev) * 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_UK: + day_offset = 7 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_UK: + day_offset = -7 + start -= 1 + used = 2 + elif word == "місяць" and not from_flag and preposition in ["через", "на"]: + if word_prev[0].isdigit(): + month_offset = int(word_prev) + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_UK: + month_offset = 1 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_UK: + month_offset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "рік" and not from_flag and preposition in ["через", "на"]: + if word_prev[0].isdigit(): + if word_prev_prev[0].isdigit(): + year_offset = int(word_prev) * int(word_prev_prev) + else: + year_offset = int(word_prev) + start -= 1 + used = 2 + elif word_prev in _WORDS_NEXT_UK: + year_offset = 1 + start -= 1 + used = 2 + elif word_prev in _WORDS_PREV_UK: + year_offset = -1 + start -= 1 + used = 2 + elif word_prev == "через": + year_offset = 1 + used = 1 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. 
+ elif word in days and not from_flag: + d = days.index(word) + day_offset = (d + 1) - int(today) + used = 1 + if day_offset < 0: + day_offset += 7 + if word_prev in _WORDS_NEXT_UK: + if day_offset <= 2: + day_offset += 7 + used += 1 + start -= 1 + elif word_prev in _WORDS_PREV_UK: + day_offset -= 7 + used += 1 + start -= 1 + elif word in months or word in months_short and not from_flag: + try: + m = months.index(word) + except ValueError: + m = months_short.index(word) + used += 1 + # Convert Ukrainian months to english + date_string = _MONTHS_CONVERSION.get(m) + if word_prev and (word_prev[0].isdigit() or + (word_prev == " " and word_prev_prev[0].isdigit())): + if word_prev == " " and word_prev_prev[0].isdigit(): + date_string += " " + words[idx - 2] + used += 1 + start -= 1 + else: + date_string += " " + word_prev + start -= 1 + used += 1 + if word_next and word_next[0].isdigit(): + date_string += " " + word_next + used += 1 + has_year = True + else: + has_year = False + + elif word_next and word_next[0].isdigit(): + date_string += " " + word_next + used += 1 + if word_next_next and word_next_next[0].isdigit(): + date_string += " " + word_next_next + used += 1 + has_year = True + else: + has_year = False + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + valid_followups = days + months + months_short + valid_followups.append("сьогодні") + valid_followups.append("завтра") + valid_followups.append("післязавтра") + valid_followups.append("вчора") + valid_followups.append("позавчора") + for followup in _WORDS_NEXT_UK: + valid_followups.append(followup) + for followup in _WORDS_PREV_UK: + valid_followups.append(followup) + for followup in _WORDS_CURRENT_UK: + valid_followups.append(followup) + for followup in _WORDS_NOW_UK: + valid_followups.append(followup) + if (word in ["до", "по", "з"]) and word_next in valid_followups: + used = 2 + from_flag = True + if word_next == "завтра": + day_offset += 1 + elif word_next == 
"післязавтра": + day_offset += 2 + elif word_next == "вчора": + day_offset -= 1 + elif word_next == "позавчора": + day_offset -= 2 + elif word_next in days: + d = days.index(word_next) + tmp_offset = (d + 1) - int(today) + used = 2 + if tmp_offset < 0: + tmp_offset += 7 + day_offset += tmp_offset + elif word_next_next and word_next_next in days: + d = days.index(word_next_next) + tmp_offset = (d + 1) - int(today) + used = 3 + if word_next in _WORDS_NEXT_UK: + if day_offset <= 2: + tmp_offset += 7 + used += 1 + start -= 1 + elif word_next in _WORDS_PREV_UK: + tmp_offset -= 7 + used += 1 + start -= 1 + day_offset += tmp_offset + if used > 0: + if start - 1 > 0 and (words[start - 1] in _WORDS_CURRENT_UK): + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + day_specified = True + + # parse time + hr_offset = 0 + min_offset = 0 + sec_offset = 0 + hr_abs = None + min_abs = None + military = False + preposition = "" + + for idx, word in enumerate(words): + if word == "": + continue + + if word in markers: + preposition = word + word = _text_uk_inflection_normalize(word, 1) + word_prev_prev = _text_uk_inflection_normalize( + words[idx - 2], 2) if idx > 1 else "" + word_prev = _text_uk_inflection_normalize( + words[idx - 1], 2) if idx > 0 else "" + word_next = _text_uk_inflection_normalize( + words[idx + 1], 2) if idx + 1 < len(words) else "" + word_next_next = _text_uk_inflection_normalize( + words[idx + 2], 2) if idx + 2 < len(words) else "" + + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "опівдні": + hr_abs = 12 + used += 1 + elif word == "північ": + hr_abs = 0 + used += 1 + elif word in _STRING_NUM_UK: + val = _STRING_NUM_UK.get(word) + elif word in _WORDS_MORNING_UK: + if hr_abs is None: + hr_abs = 8 + used += 1 + elif word in _WORDS_DAY_UK: + if hr_abs is None: + hr_abs = 15 + used += 1 + elif word in 
_WORDS_EVENING_UK: + if hr_abs is None: + hr_abs = 19 + used += 1 + if word_next != "" and word_next[0].isdigit() and ":" in word_next: + used -= 1 + elif word in _WORDS_NIGHT_UK: + if hr_abs is None: + hr_abs = 22 + # parse half an hour, quarter hour + # should be added different variations oh "hour forms" + elif word in ["година", "годину", "години"] and \ + (word_prev in markers or word_prev_prev in markers): + if word_prev in ["пів", "половина", "опів на", "опів"]: + min_offset = 30 + elif word_prev == "чверть": + min_offset = 15 + # parse in an hour + elif word_prev == "через": + hr_offset = 1 + else: + hr_offset = 1 + if word_prev_prev in markers: + words[idx - 2] = "" + if word_prev_prev in _WORDS_CURRENT_UK: + day_specified = True + words[idx - 1] = "" + used += 1 + hr_abs = -1 + min_abs = -1 + # parse 5:00 am, 12:00 p.m., etc + # parse in a minute + elif word == "хвилину" and word_prev == "через": + min_offset = 1 + words[idx - 1] = "" + used += 1 + # parse in a second + elif word == "секунду" and word_prev == "через": + sec_offset = 1 + words[idx - 1] = "" + used += 1 + elif word[0].isdigit(): + is_time = True + str_hh = "" + str_mm = "" + remainder = "" + word_next_next_next = words[idx + 3] \ + if idx + 3 < len(words) else "" + if word_next in _WORDS_EVENING_UK or word_next in _WORDS_NIGHT_UK or word_next_next in _WORDS_EVENING_UK \ + or word_next_next in _WORDS_NIGHT_UK or word_prev in _WORDS_EVENING_UK \ + or word_prev in _WORDS_NIGHT_UK or word_prev_prev in _WORDS_EVENING_UK \ + or word_prev_prev in _WORDS_NIGHT_UK or word_next_next_next in _WORDS_EVENING_UK \ + or word_next_next_next in _WORDS_NIGHT_UK: + remainder = "pm" + used += 1 + if word_prev in _WORDS_EVENING_UK or word_prev in _WORDS_NIGHT_UK: + words[idx - 1] = "" + if word_prev_prev in _WORDS_EVENING_UK or word_prev_prev in _WORDS_NIGHT_UK: + words[idx - 2] = "" + if word_next_next in _WORDS_EVENING_UK or word_next_next in _WORDS_NIGHT_UK: + used += 1 + if word_next_next_next in 
_WORDS_EVENING_UK or word_next_next_next in _WORDS_NIGHT_UK: + used += 1 + + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + str_hh += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + str_mm += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + hour = ["година", "годині"] + next_word = word_next.replace(".", "") + if next_word in ["am", "pm", "ночі", "ранку", "дня", "вечора"]: + remainder = next_word + used += 1 + # question with the case "година" + elif next_word in hour and word_next_next in ["am", "pm", "ночи", "утра", "дня", "вечера"]: + remainder = word_next_next + used += 2 + elif word_next in _WORDS_MORNING_UK: + remainder = "am" + used += 2 + elif word_next in _WORDS_DAY_UK: + remainder = "pm" + used += 2 + elif word_next in _WORDS_EVENING_UK: + remainder = "pm" + used += 2 + elif word_next == "цього" and word_next_next in _WORDS_MORNING_UK: + remainder = "am" + used = 2 + day_specified = True + elif word_next == "на" and word_next_next in _WORDS_DAY_UK: + remainder = "pm" + used = 2 + day_specified = True + elif word_next == "на" and word_next_next in _WORDS_EVENING_UK: + remainder = "pm" + used = 2 + day_specified = True + elif word_next == "в" and word_next_next in _WORDS_NIGHT_UK: + if str_hh and int(str_hh) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + elif word_next == "о" and word_next_next in _WORDS_NIGHT_UK: + if str_hh and int(str_hh) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + elif hr_abs and hr_abs != -1: + if hr_abs >= 12: + remainder = "pm" + else: + remainder = "am" + used += 1 + else: + if time_qualifier != "": + military = True + if str_hh and int(str_hh) <= 12 and \ + (time_qualifier in time_qualifiers_pm): + str_hh += str(int(str_hh) + 12) + + 
else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. + length = len(word) + str_num = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + str_num += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = word_next.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + word_next == "pm" or + remainder == "p.m." or + word_next == "p.m." or + (remainder == "дня" and preposition != 'через') or + (word_next == "дня" and preposition != 'через') or + remainder == "вечора" or + word_next == "вечора"): + str_hh = str_num + remainder = "pm" + used = 1 + if ( + remainder == "pm" or + word_next == "pm" or + remainder == "p.m." or + word_next == "p.m." or + (remainder == "дня" and preposition != 'через') or + (word_next == "дня" and preposition != 'через') or + remainder == "вечора" or + word_next == "вечора"): + str_hh = str_num + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + word_next == "am" or + remainder == "a.m." or + word_next == "a.m." 
or + remainder == "ночі" or + word_next == "ночі" or + remainder == "ранку" or + word_next == "ранку"): + str_hh = str_num + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + word_next in recur_markers or + word_next_next in recur_markers): + # Ex: "7 on mondays" or "3 this friday" + # Set str_hh so that is_time == True + # when am or pm is not specified + str_hh = str_num + used = 1 + else: + if int(str_num) > 100: + str_hh = str(int(str_num) // 100) + str_mm = str(int(str_num) % 100) + military = True + if word_next == "година": + used += 1 + elif ( + (word_next == "година" or word_next == "годину" or + remainder == "година") and + word[0] != '0' and + # (wordPrev != "в" and wordPrev != "на") + word_prev == "через" + and + ( + int(str_num) < 100 or + int(str_num) > 2400 + )): + # ignores military time + # "in 3 hours" + hr_offset = int(str_num) + used = 2 + is_time = False + hr_abs = -1 + min_abs = -1 + elif word_next == "хвилина" or \ + remainder == "хвилина": + # "in 10 minutes" + min_offset = int(str_num) + used = 2 + is_time = False + hr_abs = -1 + min_abs = -1 + elif word_next == "секунда" \ + or remainder == "секунда": + # in 5 seconds + sec_offset = int(str_num) + used = 2 + is_time = False + hr_abs = -1 + min_abs = -1 + elif int(str_num) > 100: + # military time, eg. "3300 hours" + str_hh = str(int(str_num) // 100) + str_mm = str(int(str_num) % 100) + military = True + if word_next == "час" or \ + remainder == "час": + used += 1 + elif word_next and word_next[0].isdigit(): + # military time, e.g. 
"04 38 hours" + str_hh = str_num + str_mm = word_next + military = True + used += 1 + if (word_next_next == "година" or + remainder == "час"): + used += 1 + elif ( + word_next == "" or word_next == "година" or + ( + (word_next == "в" or word_next == "на") and + ( + word_next_next == time_qualifier + ) + ) or word_next in _WORDS_EVENING_UK or + word_next_next in _WORDS_EVENING_UK): + + str_hh = str_num + str_mm = "00" + if word_next == "година": + used += 1 + if (word_next == "о" or word_next == "на" + or word_next_next == "о" or word_next_next == "на"): + used += (1 if (word_next == + "о" or word_next == "на") else 2) + word_next_next_next = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if (word_next_next and + (word_next_next in time_qualifier or + word_next_next_next in time_qualifier)): + if (word_next_next in time_qualifiers_pm or + word_next_next_next in time_qualifiers_pm): + remainder = "pm" + used += 1 + if (word_next_next in time_qualifiers_am or + word_next_next_next in time_qualifiers_am): + remainder = "am" + used += 1 + + if time_qualifier != "": + if time_qualifier in time_qualifiers_pm: + remainder = "pm" + used += 1 + + elif time_qualifier in time_qualifiers_am: + remainder = "am" + used += 1 + else: + # TODO: Unsure if this is 100% accurate + used += 1 + military = True + elif remainder == "година": + if word_next_next in ["ночі", "ранку"]: + remainder = "am" + used += 1 + elif word_next_next in ["дня", "вечора"]: + remainder = "pm" + used += 1 + else: + remainder = "" + + else: + is_time = False + hh = int(str_hh) if str_hh else 0 + mm = int(str_mm) if str_mm else 0 + hh = hh + 12 if remainder == "pm" and hh < 12 else hh + hh = hh - 12 if remainder == "am" and hh >= 12 else hh + if (not military and + remainder not in ['am', 'pm', 'година', 'хвилина', 'секунда'] and + ((not day_specified) or 0 <= day_offset < 1)): + + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + 
if anchor_date.hour < hh or (anchor_date.hour == hh and + anchor_date.minute < mm): + pass # No modification needed + elif anchor_date.hour < hh + 12: + hh += 12 + else: + # has passed, assume the next morning + day_offset += 1 + if time_qualifier in time_qualifiers_pm and hh < 12: + hh += 12 + + if hh > 24 or mm > 59: + is_time = False + used = 0 + if is_time: + hr_abs = hh + min_abs = mm + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + + # if wordPrev == "o" or wordPrev == "oh": + # words[words.index(wordPrev)] = "" + + if word_prev == "скоро": + hr_offset = -1 + words[idx - 1] = "" + idx -= 1 + elif word_prev == "пізніше": + hr_offset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and word_prev in markers: + words[idx - 1] = "" + if word_prev in _WORDS_CURRENT_UK: + day_specified = True + if idx > 1 and word_prev_prev in markers: + words[idx - 2] = "" + if word_prev_prev in _WORDS_CURRENT_UK: + day_specified = True + + idx += used - 1 + found = True + # check that we found a date + if not date_found(): + return None + + if day_offset is False: + day_offset = 0 + + # perform date manipulation + + extracted_date = anchor_date.replace(microsecond=0) + if date_string != "": + # date included an explicit date, e.g. 
"june 5" or "june 2, 2017" + try: + temp = datetime.strptime(date_string, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(date_string, "%B %d %Y") + extracted_date = extracted_date.replace(hour=0, minute=0, second=0) + if not has_year: + temp = temp.replace(year=extracted_date.year, + tzinfo=extracted_date.tzinfo) + if extracted_date < temp: + extracted_date = extracted_date.replace( + year=int(current_year), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extracted_date.tzinfo) + else: + extracted_date = extracted_date.replace( + year=int(current_year) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extracted_date.tzinfo) + else: + extracted_date = extracted_date.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extracted_date.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hr_offset == 0 and min_offset == 0 and sec_offset == 0: + extracted_date = extracted_date.replace(hour=0, minute=0, second=0) + + if year_offset != 0: + extracted_date = extracted_date + relativedelta(years=year_offset) + if month_offset != 0: + extracted_date = extracted_date + relativedelta(months=month_offset) + if day_offset != 0: + extracted_date = extracted_date + relativedelta(days=day_offset) + if hr_abs != -1 and min_abs != -1: + # If no time was supplied in the string set the time to default + # time if it's available + if hr_abs is None and min_abs is None and default_time is not None: + hr_abs, min_abs = default_time.hour, default_time.minute + else: + hr_abs = hr_abs or 0 + min_abs = min_abs or 0 + + extracted_date = extracted_date + relativedelta(hours=hr_abs, + minutes=min_abs) + if (hr_abs != 0 or min_abs != 0) and date_string == "": + if not day_specified and anchor_date > extracted_date: + extracted_date = extracted_date + relativedelta(days=1) + if hr_offset != 0: + 
def nice_time_uk(dt, speech=True, use_24hour=True, use_ampm=False):
    """
    Render a time of day in Ukrainian, for display or for speech.

    Display output looks like "14:22" (24-hour) or "2:22 дня" (12-hour
    with day part); speech output is built from the pronunciation helpers
    of this module.

    Args:
        dt (datetime): time to format (taken as-is, no timezone conversion)
        speech (bool): True for a speakable phrase, False for digits
        use_24hour (bool): 24-hour clock when True, 12-hour clock otherwise
        use_ampm (bool): in 12-hour mode, append the part of the day
            (ночі / ранку / дня / вечора)
    Returns:
        (str): The formatted time string
    """

    def _day_part(hour):
        # Ukrainian equivalent of am/pm: night, morning, afternoon, evening.
        if hour < 4:
            return " ночі"
        if hour < 12:
            return " ранку"
        if hour < 18:
            return " дня"
        return " вечора"

    # --- written form -----------------------------------------------------
    if use_24hour:
        # e.g. "03:01" or "14:22"
        text = dt.strftime("%H:%M")
    else:
        # e.g. "3:01" or "2:22", optionally followed by the day part
        text = dt.strftime("%I:%M")
        if use_ampm:
            text += _day_part(dt.hour)
        if text[0] == '0':
            text = text[1:]  # strip the leading zero of the hour
    if not speech:
        return text

    # --- spoken form, 24-hour clock ----------------------------------------
    if use_24hour:
        hh, mm = text[0:2], text[3:5]
        if hh[0] == '0':
            # Leading zero: the helper is asked about the zero digit itself
            # and the second digit is then pronounced as a plain number.
            spoken = pronounce_hour_uk(int(hh[0]))
            if not spoken:
                spoken = pronounce_number_uk(int(hh[0])) + ' '
            spoken += pronounce_number_uk(int(hh[1]))
        else:
            spoken = pronounce_hour_uk(int(hh))
            if spoken is None:
                spoken = pronounce_number_uk(int(hh))

        spoken += " "
        if mm == '00':
            return spoken + "рівно"
        if mm[0] == '0':
            # single-digit minutes are read as "zero <digit>"
            return (spoken + pronounce_number_uk(0) + " " +
                    pronounce_number_uk(int(mm[1])))
        return spoken + pronounce_number_uk(int(mm))

    # --- spoken form, 12-hour clock ----------------------------------------
    if dt.minute == 0 and dt.hour in (0, 12):
        return "опівночі" if dt.hour == 0 else "опівдні"

    hour = dt.hour % 12 or 12  # 12 hour clock and 0 is spoken as 12
    if dt.minute == 15:
        return "чверть після " + pronounce_hour_genitive_uk(hour)
    if dt.minute == 30:
        return "половина після " + pronounce_hour_genitive_uk(hour)
    if dt.minute == 45:
        next_hour = (dt.hour + 1) % 12 or 12
        return "без четверті " + pronounce_hour_uk(next_hour)

    spoken = pronounce_hour_uk(hour)
    if use_ampm:
        spoken += _day_part(dt.hour)

    if dt.minute != 0:
        if dt.minute < 10:
            spoken += " нуль"
        return spoken + " " + pronounce_number_uk(dt.minute)

    # Full hour: with a day part, or at one o'clock, nothing more is added.
    if use_ampm or dt.hour % 12 == 1:
        return spoken
    # TODO: the `one`/`few`/`many` structure doesn't cover
    # all cases in Ukrainian
    return spoken + " " + plural_uk(dt.hour % 12, one="година",
                                    few="години", many="годин")
def nice_duration_uk(duration, speech=True):
    """Convert a duration to a spoken Ukrainian timespan.

    Args:
        duration (int|float): length of the timespan in seconds
        speech (bool): must be True; a display form is not implemented
    Returns:
        str: timespan as a string, e.g. days, hours, minutes and seconds
            joined by single spaces; empty when every component is zero
    Raises:
        NotImplementedError: when ``speech`` is False
    """
    if not speech:
        raise NotImplementedError

    days = int(duration // 86400)
    hours = int(duration // 3600 % 24)
    minutes = int(duration // 60 % 60)
    seconds = int(duration % 60)

    parts = []
    if days > 0:
        parts.append(pronounce_number_uk(days) + " " +
                     plural_uk(days, "день", "дня", "днів"))
    if hours > 0:
        spoken = pronounce_number_feminine_uk(hours)
        # Safety net kept from the original: a bare masculine "one" in
        # front of the feminine noun is forced to its feminine form.
        if not parts and spoken == 'один':
            spoken = 'одна'
        parts.append(spoken + " " +
                     plural_uk(hours, "година", "години", "годин"))
    if minutes > 0:
        parts.append(pronounce_number_feminine_uk(minutes) + " " +
                     plural_uk(minutes, "хвилина", "хвилини", "хвилин"))
    if seconds > 0:
        parts.append(pronounce_number_feminine_uk(seconds) + " " +
                     plural_uk(seconds, "секунда", "секунди", "секунд"))

    return " ".join(parts)


def pronounce_hour_uk(num):
    """Return the hour name followed by ' година', or None if unknown.

    Args:
        num (int): hour looked up in HOURS_UK
    Returns:
        str|None: spoken hour, None when ``num`` is not a key of HOURS_UK
    """
    if num in HOURS_UK:
        return HOURS_UK[num] + ' година'
    return None


def pronounce_mins_uk(num):
    """Return a spoken minute count for the few values it knows about.

    Only 1, 2 and the round tens 10..60 are handled; every other value
    (including numbers missing from _NUM_STRING_UK) yields None.

    Args:
        num (int): number of minutes
    Returns:
        str|None: spoken minutes or None
    """
    if num not in _NUM_STRING_UK:
        return None
    if num == 1:
        return 'одна хвилина'
    if num == 2:
        return 'дві хвилини'
    if num in (10, 20, 30, 40, 50, 60):
        # The original built this string without a separating space and
        # then discarded it (missing `return`).
        return _NUM_STRING_UK[num] + ' хвилин'
    return None


def pronounce_hour_genitive_uk(num):
    """Return the hour in the genitive case followed by ' години'.

    The last character of the HOURS_UK entry is replaced by the genitive
    ending; hour 3 takes the soft ending 'ьої', all others 'ої'.

    Args:
        num (int): hour looked up in HOURS_UK
    Returns:
        str|None: spoken genitive hour, None when ``num`` is not a key
    """
    if num not in HOURS_UK:
        return None
    ending = 'ьої' if num == 3 else 'ої'
    return HOURS_UK[num][:-1] + ending + ' години'


def pronounce_number_feminine_uk(num):
    """Pronounce ``num`` with a feminine final 'one' / 'two'.

    Numbers ending in 1 or 2 (but not 11 or 12) get their last word
    switched to the feminine form by swapping the trailing characters of
    what pronounce_number_uk returns.

    Args:
        num (int): number to pronounce
    Returns:
        str: pronounced number
    """
    pronounced = pronounce_number_uk(num)

    tens, units = divmod(num % 100, 10)
    if tens != 1:
        if units == 1:
            return pronounced[:-2] + "на"
        if units == 2:
            return pronounced[:-1] + "і"
    return pronounced


def plural_uk(num: int, one: str, few: str, many: str):
    """Pick the noun form that agrees with ``num``.

    Args:
        num (int): the count
        one (str): form for counts ending in 1 (except 11)
        few (str): form for counts ending in 2-4 (except 12-14)
        many (str): form for everything else
    Returns:
        str: one of ``one``, ``few`` or ``many``
    """
    tens, units = divmod(num % 100, 10)
    if tens == 1:
        return many
    if units == 1:
        return one
    if 2 <= units <= 4:
        return few
    return many
def _text_uk_inflection_normalize(word, arg):
    """
    Ukrainian inflection normalizer.

    Maps known inflected forms onto one canonical form. The function is
    called from several places; ``arg`` selects which vocabulary applies.

    Args:
        word (str): the word to normalize
        arg (int): 1 when called from _extract_whole_number_with_text_uk
            (numerals), 2 when called from extract_datetime_uk (time units,
            parts of the day, weekdays, months); any other value leaves
            the word untouched

    Returns:
        str: the canonical form, or ``word`` itself when nothing matches
    """
    if arg == 1:  # _extract_whole_number_with_text_uk
        if word in ("одна", "одним", "одно", "одною", "одного",
                    "одної", "одному", "одній", "одну"):
            return "один"
        return _plurals_normalizer(word)

    if arg == 2:  # extract_datetime_uk
        time_units = (
            (("година", "години", "годин", "годину", "годинами"), "година"),
            (("хвилина", "хвилини", "хвилину", "хвилин", "хвилька"),
             "хвилина"),
            (("секунд", "секунди", "секундами", "секунду", "сек"),
             "секунда"),
            (("днів", "дні", "днями", "дню", "днем"), "день"),
            (("тижні", "тижнів", "тижнями", "тиждень", "тижня"), "тиждень"),
            (("місяцем", "місяці", "місяця", "місяцях", "місяцями",
              "місяців"), "місяць"),
            (("року", "роки", "році", "роках", "роком", "роками", "років"),
             "рік"),
        )
        for forms, canonical in time_units:
            if word in forms:
                return canonical

        # Parts of the day; checked in the same order as before so a word
        # listed in more than one vocabulary resolves the same way.
        if word in _WORDS_MORNING_UK:
            return "вранці"
        if word in ("опівдні", "півдня"):
            return "південь"
        if word in _WORDS_EVENING_UK:
            return "ввечері"
        if word in _WORDS_NIGHT_UK:
            return "ніч"

        calendar_terms = (
            (("вікенд", "вихідних", "вихідними"), "вихідні"),
            (("столітті", "століттях", "століть"), "століття"),
            (("десятиліття", "десятиліть", "десятиліттях"), "десятиліття"),
            # Week days
            (("понеділка", "понеділки"), "понеділок"),
            (("вівторка", "вівторки"), "вівторок"),
            # was "среда" (Russian spelling), which is not a Ukrainian
            # weekday name
            (("середу", "середи"), "середа"),
            (("четверга",), "четвер"),
            (("п'ятницю", "п'ятниці"), "п'ятниця"),
            (("суботу", "суботи"), "субота"),
            (("неділю", "неділі"), "неділя"),
            # Months whose stem does not follow the "-ень" rule below
            (("лютому", "лютого", "лютим"), "лютий"),
            (("листопада", "листопаді", "листопадом"), "листопад"),
        )
        for forms, canonical in calendar_terms:
            if word in forms:
                return canonical

        # Remaining months: swap the case ending for "-ень" and accept the
        # result only if it is a known month name.
        candidate = ''
        if word[-3:] in ("ого", "ому"):
            candidate = word[:-3] + "ень"
        elif word[-2:] in ("ні", "ня"):
            candidate = word[:-2] + "ень"
        for name in _MONTHS_UK:
            if name == candidate:
                return name

    return word


def _longest_case_ending(word, endings):
    """Return the longest entry of ``endings`` contained in ``word``, or ''."""
    hits = [ending for ending in endings if ending in word]
    return max(hits, key=len) if hits else ''


def _plurals_normalizer(word):
    """
    Ukrainian plurals normalizer.

    Normalizes case/plural endings of numerals back to the base numeral.
    Uses the _PLURALS dictionary for exceptions that the suffix rules
    cannot cover.

    Several case endings are prefixes of one another ("надцятьом" /
    "надцятьома", "стам" / "стами", ...). The longest ending found in the
    word is the one stripped; concatenating every match, as was done
    before, produced a string that is not in the word, so nothing was
    stripped and the suffix was appended to the unmodified word.

    Args:
        word (str): numeral, possibly inflected

    Returns:
        str: normalized numeral, or ``word`` when no rule applies
    """
    if word in _STRING_NUM_UK:
        return word

    # plurals 2-10 (and other irregular forms)
    if word in _PLURALS:
        return _NUM_STRING_UK[_PLURALS[word]]

    # plurals 11-19
    ending = _longest_case_ending(
        word, ('надцяти', 'надцятим', 'надцятими', 'надцятьох',
               'надцятьма', 'надцятьома', 'надцятьом'))
    if ending:
        if 'один' in word:
            return "одинадцять"
        return word.replace(ending, '') + 'надцять'

    # plurals 20, 30
    ending = _longest_case_ending(
        word, ('дцяти', 'дцятим', 'дцятими', 'дцятьох',
               'дцятьма', 'дцятьома', 'дцятьом'))
    if ending:
        return word.replace(ending, '') + 'дцять'

    # plurals 50, 60, 70, 80
    ending = _longest_case_ending(
        word, ('десятьох', 'десяти', 'десятьом', 'десятьма', 'десятьома'))
    if ending:
        return word.replace(ending, '') + 'десят'

    # plurals 90, 100 and the hundreds
    ending = _longest_case_ending(
        word, ('стам', 'стами', 'стах', 'ста', 'сот'))
    if ending:
        stem = word.replace(ending, '')
        if stem in _PLURALS:
            value = _PLURALS[stem]
            first_part = _NUM_STRING_UK[value]
            if value in (3, 4):
                return first_part + 'ста'
            if value in (5, 6, 9):
                return first_part[:-1] + 'сот'
            if value in (7, 8):
                return first_part + 'сот'
        # unknown multiplier: hand back the stem, as before
        return stem

    return word
def package_files(directory):
    """Collect every file below ``directory`` for use as package data.

    Args:
        directory (str): root directory to walk
    Returns:
        list[str]: one path per file, each prefixed with '..'
    """
    return [os.path.join('..', path, filename)
            for path, _dirs, filenames in os.walk(directory)
            for filename in filenames]


def required(requirements_file):
    """Read a requirements file and drop comments and empty lines.

    Lines are stripped before they are inspected, so indented comments and
    whitespace-only lines are skipped and no entry carries stray padding.

    Args:
        requirements_file (str): path relative to this file's directory
    Returns:
        list[str]: requirement specifiers in file order
    """
    requirements_path = os.path.join(os.path.dirname(__file__),
                                     requirements_file)
    with open(requirements_path, 'r') as f:
        candidates = [line.strip() for line in f.read().splitlines()]
    return [pkg for pkg in candidates if pkg and not pkg.startswith("#")]


def get_version():
    """Find the version of the package.

    Reads ovos_date_parser/version.py under BASEDIR line by line, taking
    the text after '=' for each VERSION_* field. Scanning stops once all
    four fields are known or the '# END_VERSION_BLOCK' marker is reached.

    Returns:
        str: "<major>.<minor>.<build>", with "a<alpha>" appended when the
            alpha field is greater than zero
    """
    version_file = os.path.join(BASEDIR, 'ovos_date_parser', 'version.py')
    fields = {
        'VERSION_MAJOR': None,
        'VERSION_MINOR': None,
        'VERSION_BUILD': None,
        'VERSION_ALPHA': None,
    }
    with open(version_file) as f:
        for line in f:
            for key in fields:
                if key in line:
                    fields[key] = line.split('=')[1].strip()
                    break  # at most one field per line

            if all(fields.values()) or '# END_VERSION_BLOCK' in line:
                break

    version = (f"{fields['VERSION_MAJOR']}.{fields['VERSION_MINOR']}"
               f".{fields['VERSION_BUILD']}")
    alpha = fields['VERSION_ALPHA']
    if alpha and int(alpha) > 0:
        version += f"a{alpha}"
    return version
3.9', + ] +)