From db7cf7ce66e43e51356b8ee1fa2003b0d08ad541 Mon Sep 17 00:00:00 2001 From: Christopher Lam Date: Mon, 2 Sep 2024 00:21:35 +0800 Subject: [PATCH] [gnc-datetime] improve CSV date parser with ICU and boost MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Add dateformat "Locale" with ICU; uses current locale for date parsing. ICU's locale date parser may parse "3 May 2023" or "2024年9月13日" (LC_TIME=zh_TW.utf8) and maybe others. 2. Augment d-m-y m-d-y and y-m-d with boost UK/US/ISO parsers. This allows CSV import of dates with months as words as "30 Sep 2023" or "May 4, 1978" or "2023-Dec-25". Note boost parser cannot recognise 2-digit years, therefore "30 Sep 24" is invalid. --- libgnucash/engine/gnc-datetime.cpp | 87 ++++++++++++++++++- libgnucash/engine/gnc-datetime.hpp | 11 +++ libgnucash/engine/test/gtest-gnc-datetime.cpp | 38 ++++++++ 3 files changed, 134 insertions(+), 2 deletions(-) diff --git a/libgnucash/engine/gnc-datetime.cpp b/libgnucash/engine/gnc-datetime.cpp index 308ec24e4ef..fda7c255b50 100644 --- a/libgnucash/engine/gnc-datetime.cpp +++ b/libgnucash/engine/gnc-datetime.cpp @@ -29,6 +29,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include @@ -70,6 +75,8 @@ static const TZ_Ptr utc_zone(new boost::local_time::posix_time_zone("UTC-0")); void _set_tzp(TimeZoneProvider& tz); void _reset_tzp(); +static Date gregorian_date_from_locale_string (const std::string& str); + /* To ensure things aren't overly screwed up by setting the nanosecond clock for boost::date_time. Don't do it, though, it doesn't get us anything and slows down the date/time library. 
*/ #ifndef BOOST_DATE_TIME_HAS_NANOSECONDS static constexpr auto ticks_per_second = INT64_C(1000000); @@ -78,7 +85,7 @@ static constexpr auto ticks_per_second = INT64_C(1000000000); #endif /* Vector of date formats understood by gnucash and corresponding regex - * to parse each from an external source + * and/or string->gregorian_date to parse each from an external source * Note: while the format names are using a "-" as separator, the * regexes will accept any of "-/.' " and will also work for dates * without separators. @@ -86,6 +93,7 @@ static constexpr auto ticks_per_second = INT64_C(1000000000); const std::vector GncDate::c_formats ({ GncDateFormat { N_("y-m-d"), + boost::gregorian::from_string, "(?:" // either y-m-d "(?[0-9]+)[-/.' ]+" "(?[0-9]+)[-/.' ]+" @@ -98,6 +106,7 @@ const std::vector GncDate::c_formats ({ }, GncDateFormat { N_("d-m-y"), + boost::gregorian::from_uk_string, "(?:" // either d-m-y "(?[0-9]+)[-/.' ]+" "(?[0-9]+)[-/.' ]+" @@ -110,6 +119,7 @@ const std::vector GncDate::c_formats ({ }, GncDateFormat { N_("m-d-y"), + boost::gregorian::from_us_string, "(?:" // either m-d-y "(?[0-9]+)[-/.' ]+" "(?[0-9]+)[-/.' ]+" @@ -145,7 +155,8 @@ const std::vector GncDate::c_formats ({ "(?[0-9]{2})" "(?[0-9]+)?" ")" - } + }, + GncDateFormat { N_("Locale"), gregorian_date_from_locale_string }, }); /** Private implementation of GncDateTime. See the documentation for that class. 
@@ -607,6 +618,103 @@ GncDateTimeImpl::timestamp()
     return str.substr(0, 8) + str.substr(9, 15);
 }
 
+/* ICU objects shared by every "Locale" date parse. */
+struct ICUResources
+{
+    /* Parses date text in the LC_TIME locale's default date style. */
+    std::unique_ptr<icu::DateFormat> formatter;
+    /* Gregorian calendar used to split a parsed UDate into fields. */
+    std::unique_ptr<icu::Calendar> calendar;
+};
+
+/* Return the lazily created ICU resources, built for the LC_TIME locale
+ * in effect at the first successful call.
+ *
+ * The cache is filled only once both objects exist, so a failed attempt
+ * throws std::invalid_argument and is retried on the next call; callers
+ * are never handed a formatter paired with a null calendar.
+ */
+static ICUResources&
+get_icu_resources()
+{
+    static ICUResources rv;
+
+    if (rv.formatter && rv.calendar)
+        return rv;
+
+    icu::Locale locale;
+    if (auto lc_time_locale = setlocale (LC_TIME, nullptr))
+    {
+        /* Drop the codeset suffix (e.g. ".utf8") before asking ICU. */
+        std::string localeStr(lc_time_locale);
+        if (size_t dotPos = localeStr.find('.'); dotPos != std::string::npos)
+            localeStr = localeStr.substr(0, dotPos);
+
+        locale = icu::Locale::createCanonical (localeStr.c_str());
+    }
+
+    std::unique_ptr<icu::DateFormat> formatter(
+        icu::DateFormat::createDateInstance(icu::DateFormat::kDefault, locale));
+    if (!formatter)
+        throw std::invalid_argument("Cannot create date formatter.");
+
+    /* The caller wants a Gregorian date, but the default calendar of some
+     * locales is not Gregorian (e.g. Buddhist for th_TH), so request the
+     * Gregorian calendar explicitly for reading the fields back. */
+    UErrorCode status = U_ZERO_ERROR;
+    icu::Locale field_locale(locale);
+    field_locale.setKeywordValue("calendar", "gregorian", status);
+    std::unique_ptr<icu::Calendar> calendar(
+        icu::Calendar::createInstance(field_locale, status));
+    if (U_FAILURE(status) || !calendar)
+        throw std::invalid_argument("Cannot create calendar instance.");
+
+    /* NOTE(review): this only affects the field-extraction calendar; the
+     * formatter keeps its own leniency. Confirm whether strict parsing
+     * (formatter->setLenient(false)) was the intent. */
+    calendar->setLenient(false);
+
+    rv.formatter.swap(formatter);
+    rv.calendar.swap(calendar);
+    return rv;
+}
+
+/* Parse str as a date written in the current LC_TIME locale, using ICU.
+ *
+ * @param str UTF-8 date text; it must be consumed completely.
+ * @return The corresponding Gregorian date.
+ * @throws std::invalid_argument if the ICU resources cannot be created,
+ * the text does not parse, or the calendar fields cannot be read.
+ */
+static Date
+gregorian_date_from_locale_string (const std::string& str)
+{
+    ICUResources& resources = get_icu_resources();
+
+    icu::UnicodeString input = icu::UnicodeString::fromUTF8(str);
+    icu::ParsePosition parsePos;
+    UDate date = resources.formatter->parse(input, parsePos);
+    /* Unparsed trailing text is rejected as well as an outright failure. */
+    if (parsePos.getErrorIndex() != -1 || parsePos.getIndex() != input.length())
+        throw std::invalid_argument ("Cannot parse string");
+
+    UErrorCode status = U_ZERO_ERROR;
+    resources.calendar->setTime(date, status);
+    if (U_FAILURE(status))
+        throw std::invalid_argument ("Cannot set calendar time");
+
+    /* Read every field before building the date so that a failed get()
+     * is reported instead of silently feeding Date a bogus value.
+     * UCAL_MONTH is zero-based, hence the + 1. */
+    const auto year = resources.calendar->get(UCAL_YEAR, status);
+    const auto month = resources.calendar->get(UCAL_MONTH, status) + 1;
+    const auto day = resources.calendar->get(UCAL_DATE, status);
+    if (U_FAILURE(status))
+        throw std::invalid_argument ("Cannot read calendar fields");
+
+    return Date (year, month, day);
+}
+
 /* Member function definitions for GncDateImpl.
*/ GncDateImpl::GncDateImpl(const std::string str, const std::string fmt) : @@ -617,6 +687,19 @@ GncDateImpl::GncDateImpl(const std::string str, const std::string fmt) : if (iter == GncDate::c_formats.cend()) throw std::invalid_argument(N_("Unknown date format specifier passed as argument.")); + if (iter->m_str_to_date) + { + try + { + m_greg = (*iter->m_str_to_date)(str); + return; + } + catch (...) {} // with any string->date exception, try regex + } + + if (iter->m_re.empty()) + throw std::invalid_argument ("No regex pattern available"); + boost::regex r(iter->m_re); boost::smatch what; if(!boost::regex_search(str, what, r)) // regex didn't find a match diff --git a/libgnucash/engine/gnc-datetime.hpp b/libgnucash/engine/gnc-datetime.hpp index 77a6039b10a..6c150a92225 100644 --- a/libgnucash/engine/gnc-datetime.hpp +++ b/libgnucash/engine/gnc-datetime.hpp @@ -29,6 +29,10 @@ #include #include #include +#include +#include + +#include typedef struct { @@ -172,6 +176,8 @@ class GncDateTime * GncDate::c_formats class variable and work with those. */ +using StringToDate = std::function; + class GncDateFormat { public: @@ -182,6 +188,10 @@ class GncDateFormat */ GncDateFormat (const char* fmt, const char* re) : m_fmt(fmt), m_re(re) {} + GncDateFormat (const char* fmt, StringToDate str_to_date, const char* re) : + m_fmt(fmt), m_re(re), m_str_to_date(str_to_date) {} + GncDateFormat (const char* fmt, StringToDate str_to_date) : + m_fmt(fmt), m_str_to_date(str_to_date) {} /** A string representing the format. */ const std::string m_fmt; private: @@ -189,6 +199,7 @@ class GncDateFormat * only be used internally by the gnc-datetime code. 
*/ const std::string m_re; + std::optional m_str_to_date; friend class GncDateImpl; }; diff --git a/libgnucash/engine/test/gtest-gnc-datetime.cpp b/libgnucash/engine/test/gtest-gnc-datetime.cpp index 70b8b1a614f..b1d0efaf4cf 100644 --- a/libgnucash/engine/test/gtest-gnc-datetime.cpp +++ b/libgnucash/engine/test/gtest-gnc-datetime.cpp @@ -89,6 +89,12 @@ TEST(gnc_date_constructors, test_str_format_constructor) { "y-m-d", "1985.3.12", 1985, 3, 12}, { "y-m-d", "3'6'8", 2003, 6, 8}, { "y-m-d", "20130801", 2013, 8, 1}, + { "y-m-d", "2013 Aug 1", 2013, 8, 1}, + { "y-m-d", "2013 Aug 01",2013, 8, 1}, + { "y-m-d", "2013 August 01", 2013, 8, 1}, + { "y-m-d", "2013-August-1", 2013, 8, 1}, + { "y-m-d", "2009/Nov/04",2009, 11, 4}, + { "y-m-d","1985.Mar.12", 1985, 3, 12}, { "d-m-y", "01-08-2013", 2013, 8, 1}, { "d-m-y", "01-8-2013", 2013, 8, 1}, { "d-m-y", "1-08-2013", 2013, 8, 1}, @@ -101,6 +107,9 @@ TEST(gnc_date_constructors, test_str_format_constructor) { "d-m-y", "12.3.1985", 1985, 3, 12}, { "d-m-y", "8'6'3", 2003, 6, 8}, { "d-m-y", "01082013", 2013, 8, 1}, + { "d-m-y", "1 Aug 2013", 2013, 8, 1}, + { "d-m-y", "1 Sep 2013", 2013, 9, 1}, + { "d-m-y", "1 September 2013", 2013, 9, 1}, { "m-d-y", "08-01-2013", 2013, 8, 1}, { "m-d-y", "8-01-2013", 2013, 8, 1}, { "m-d-y", "08-1-2013", 2013, 8, 1}, @@ -113,6 +122,8 @@ TEST(gnc_date_constructors, test_str_format_constructor) { "m-d-y", "3.12.1985", 1985, 3, 12}, { "m-d-y", "6'8'3", 2003, 6, 8}, { "m-d-y", "08012013", 2013, 8, 1}, + { "m-d-y", "November 4, 2009", 2009, 11, 4}, + { "m-d-y", "Nov 4, 2009", 2009, 11, 4}, { "d-m", "01-08", curr_year, 8, 1}, { "d-m", "01-8", curr_year, 8, 1}, { "d-m", "1-08", curr_year, 8, 1}, @@ -130,6 +141,29 @@ TEST(gnc_date_constructors, test_str_format_constructor) { "m-d", "6'8", curr_year, 6, 8}, { "m-d", "0801", curr_year, 8, 1}, + // invalid dates + { "d-m-y", "0 Aug 2013", -1, -1, -1}, + { "d-m-y", "31 Sep 2013", -1, -1, -1}, + { "d-m-y", "31 September 2013", -1, -1, -1}, + { "d-m-y", 
"31/11/2009", -2, -2, -2}, + { "d-m-y", "34.3.1985", -2, -2, -2}, + { "m-d-y", "November 41, 2009", -1, -1, -1}, + { "m-d-y", "Nov 31, 2009", -1, -1, -1}, + { "y-m-d", "2013 Aug 0", -1, -1, -1}, + { "y-m-d", "2013 Feb 30", -1, -1, -1}, + { "y-m-d", "2013 August 0", -1, -1, -1}, + { "y-m-d", "2013-June-31", -1, -1, -1}, + { "y-m-d", "2009/Nov/0", -1, -1, -1}, + { "y-m-d", "1985.Mar.32", -1, -1, -1}, + + // 2-digit dates are not parsable with months as words + { "d-m-y", "1 Sep 13", -1, -1, -1}, + { "d-m-y", "1 September 13", -1, -1, -1}, + { "m-d-y", "November 4, 24", -1, -1, -1}, + { "m-d-y", "Nov 4, 23", -1, -1, -1}, + { "m-d-y", "Nov 29, 24", -1, -1, -1}, + { "y-m-d", "13-June-11", -1, -1, -1}, + // ambiguous date formats // current parser doesn't know how to disambiguate // and hence refuses to parse @@ -186,6 +220,10 @@ TEST(gnc_date_constructors, test_str_format_constructor) { got_year = got_month = got_day = -1; } + catch (const std::out_of_range&) + { + got_year = got_month = got_day = -2; + } EXPECT_TRUE ((got_year == test_dates[i].exp_year) && (got_month == test_dates[i].exp_month) &&