Skip to content

Commit

Permalink
[gnc-datetime] improve CSV date parser with ICU and boost
Browse files Browse the repository at this point in the history
1. Add dateformat "Locale" with ICU; uses current locale for date
   parsing. ICU's locale date parser may parse "3 May 2023" or
   "2024年9月13日" (LC_TIME=zh_TW.utf8) and maybe others.

2. Augment the d-m-y, m-d-y and y-m-d formats with the boost UK/US/ISO
   parsers. This allows CSV import of dates whose months are written as
   words, such as "30 Sep 2023", "May 4, 1978" or "2023-Dec-25". Note that
   the boost parsers cannot recognise 2-digit years, so "30 Sep 24" is invalid.
  • Loading branch information
christopherlam committed Sep 14, 2024
1 parent c7b55d4 commit db7cf7c
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 2 deletions.
87 changes: 85 additions & 2 deletions libgnucash/engine/gnc-datetime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@
#include <boost/date_time/local_time/local_time.hpp>
#include <boost/locale.hpp>
#include <boost/regex.hpp>
#include <unicode/smpdtfmt.h>
#include <unicode/locid.h>
#include <unicode/udat.h>
#include <unicode/parsepos.h>
#include <unicode/calendar.h>
#include <libintl.h>
#include <locale.h>
#include <map>
Expand Down Expand Up @@ -70,6 +75,8 @@ static const TZ_Ptr utc_zone(new boost::local_time::posix_time_zone("UTC-0"));
void _set_tzp(TimeZoneProvider& tz);
void _reset_tzp();

static Date gregorian_date_from_locale_string (const std::string& str);

/* To ensure things aren't overly screwed up by setting the nanosecond clock for boost::date_time. Don't do it, though, it doesn't get us anything and slows down the date/time library. */
#ifndef BOOST_DATE_TIME_HAS_NANOSECONDS
static constexpr auto ticks_per_second = INT64_C(1000000);
Expand All @@ -78,14 +85,15 @@ static constexpr auto ticks_per_second = INT64_C(1000000000);
#endif

/* Vector of date formats understood by gnucash and corresponding regex
* to parse each from an external source
* and/or string->gregorian_date to parse each from an external source
* Note: while the format names are using a "-" as separator, the
* regexes will accept any of "-/.' " and will also work for dates
* without separators.
*/
const std::vector<GncDateFormat> GncDate::c_formats ({
GncDateFormat {
N_("y-m-d"),
boost::gregorian::from_string,
"(?:" // either y-m-d
"(?<YEAR>[0-9]+)[-/.' ]+"
"(?<MONTH>[0-9]+)[-/.' ]+"
Expand All @@ -98,6 +106,7 @@ const std::vector<GncDateFormat> GncDate::c_formats ({
},
GncDateFormat {
N_("d-m-y"),
boost::gregorian::from_uk_string,
"(?:" // either d-m-y
"(?<DAY>[0-9]+)[-/.' ]+"
"(?<MONTH>[0-9]+)[-/.' ]+"
Expand All @@ -110,6 +119,7 @@ const std::vector<GncDateFormat> GncDate::c_formats ({
},
GncDateFormat {
N_("m-d-y"),
boost::gregorian::from_us_string,
"(?:" // either m-d-y
"(?<MONTH>[0-9]+)[-/.' ]+"
"(?<DAY>[0-9]+)[-/.' ]+"
Expand Down Expand Up @@ -145,7 +155,8 @@ const std::vector<GncDateFormat> GncDate::c_formats ({
"(?<DAY>[0-9]{2})"
"(?<YEAR>[0-9]+)?"
")"
}
},
GncDateFormat { N_("Locale"), gregorian_date_from_locale_string },
});

/** Private implementation of GncDateTime. See the documentation for that class.
Expand Down Expand Up @@ -607,6 +618,65 @@ GncDateTimeImpl::timestamp()
return str.substr(0, 8) + str.substr(9, 15);
}

/* Bundle of the ICU objects used to parse a date string in the user's
 * locale. Both members start out null; get_icu_resources() fills them in.
 */
struct ICUResources
{
/* Owning pointer to the ICU date formatter used to parse input strings. */
std::unique_ptr<icu::DateFormat> formatter;
/* Owning pointer to the ICU calendar used to split a parsed UDate into
 * year/month/day fields. */
std::unique_ptr<icu::Calendar> calendar;
};

/* Return the shared ICU formatter/calendar pair, creating it on first use.
 *
 * The locale is taken from the current LC_TIME setting (as reported by
 * setlocale) with everything from the first '.' onwards removed, e.g.
 * "zh_TW.utf8" becomes "zh_TW". If setlocale returns null, a
 * default-constructed icu::Locale is used.
 *
 * Both objects are built into locals and only stored in the static once
 * both were created successfully. Otherwise a failure while creating the
 * calendar would leave a formatter without a calendar behind, and every
 * later call would skip initialisation and hand out a null calendar.
 *
 * @return Reference to the function-local static resources; both members
 *         are non-null on return.
 * @throws std::invalid_argument if the formatter or calendar cannot be
 *         created. Nothing is cached in that case, so the next call retries.
 */
static ICUResources&
get_icu_resources()
{
    static ICUResources rv;

    if (rv.formatter && rv.calendar)
        return rv;

    icu::Locale locale;
    if (auto lc_time_locale = setlocale (LC_TIME, nullptr))
    {
        std::string localeStr(lc_time_locale);
        // Drop the codeset suffix (".utf8" etc.) before handing the name to ICU.
        if (size_t dotPos = localeStr.find('.'); dotPos != std::string::npos)
            localeStr = localeStr.substr(0, dotPos);

        locale = icu::Locale::createCanonical (localeStr.c_str());
    }

    std::unique_ptr<icu::DateFormat> formatter(
        icu::DateFormat::createDateInstance(icu::DateFormat::kDefault, locale));
    if (!formatter)
        throw std::invalid_argument("Cannot create date formatter.");

    UErrorCode status = U_ZERO_ERROR;
    std::unique_ptr<icu::Calendar> calendar(
        icu::Calendar::createInstance(locale, status));
    if (U_FAILURE(status) || !calendar)
        throw std::invalid_argument("Cannot create calendar instance.");

    // NOTE(review): this makes only this stand-alone calendar strict. The
    // formatter's own leniency is left at its default here -- confirm
    // whether strict parsing of the input string was intended as well.
    calendar->setLenient(false);

    // Commit both together so the cache is never half-initialised.
    rv.formatter.swap(formatter);
    rv.calendar.swap(calendar);

    return rv;
}

/* Parse a date written in the current locale's default date style.
 *
 * The string is parsed with the shared ICU formatter from
 * get_icu_resources(); the whole string must be consumed by the parse.
 * The resulting instant is then broken into year/month/day with the
 * shared ICU calendar.
 *
 * @param str UTF-8 encoded date string.
 * @return The date built from the calendar's year, month and day fields.
 * @throws std::invalid_argument if the string cannot be parsed completely,
 *         or if the calendar cannot be set or read. Exceptions thrown by
 *         get_icu_resources() or by the Date constructor propagate.
 */
static Date
gregorian_date_from_locale_string (const std::string& str)
{
    ICUResources& resources = get_icu_resources();

    icu::UnicodeString input = icu::UnicodeString::fromUTF8(str);
    icu::ParsePosition parsePos;
    UDate date = resources.formatter->parse(input, parsePos);
    // Reject both outright parse errors and trailing unparsed text.
    if (parsePos.getErrorIndex() != -1 || parsePos.getIndex() != input.length())
        throw std::invalid_argument ("Cannot parse string");

    UErrorCode status = U_ZERO_ERROR;
    resources.calendar->setTime(date, status);
    if (U_FAILURE(status))
        throw std::invalid_argument ("Cannot set calendar time");

    // NOTE(review): the fields are read from the locale's calendar; confirm
    // this is the intended behaviour for locales whose default calendar is
    // not Gregorian.
    const auto year = resources.calendar->get(UCAL_YEAR, status);
    // UCAL_MONTH is zero-based, Date expects 1..12.
    const auto month = resources.calendar->get(UCAL_MONTH, status) + 1;
    const auto day = resources.calendar->get(UCAL_DATE, status);
    // status is sticky across the three reads; one check covers them all.
    if (U_FAILURE(status))
        throw std::invalid_argument ("Cannot read calendar fields");

    return Date (year, month, day);
}

/* Member function definitions for GncDateImpl.
*/
GncDateImpl::GncDateImpl(const std::string str, const std::string fmt) :
Expand All @@ -617,6 +687,19 @@ GncDateImpl::GncDateImpl(const std::string str, const std::string fmt) :
if (iter == GncDate::c_formats.cend())
throw std::invalid_argument(N_("Unknown date format specifier passed as argument."));

if (iter->m_str_to_date)
{
try
{
m_greg = (*iter->m_str_to_date)(str);
return;
}
catch (...) {} // with any string->date exception, try regex
}

if (iter->m_re.empty())
throw std::invalid_argument ("No regex pattern available");

boost::regex r(iter->m_re);
boost::smatch what;
if(!boost::regex_search(str, what, r)) // regex didn't find a match
Expand Down
11 changes: 11 additions & 0 deletions libgnucash/engine/gnc-datetime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
#include <memory>
#include <string>
#include <vector>
#include <functional>
#include <optional>

#include <boost/date_time/gregorian/gregorian.hpp>

typedef struct
{
Expand Down Expand Up @@ -172,6 +176,8 @@ class GncDateTime
* GncDate::c_formats class variable and work with those.
*/

using StringToDate = std::function<boost::gregorian::date(const std::string&)>;

class GncDateFormat
{
public:
Expand All @@ -182,13 +188,18 @@ class GncDateFormat
*/
GncDateFormat (const char* fmt, const char* re) :
m_fmt(fmt), m_re(re) {}
GncDateFormat (const char* fmt, StringToDate str_to_date, const char* re) :
m_fmt(fmt), m_re(re), m_str_to_date(str_to_date) {}
GncDateFormat (const char* fmt, StringToDate str_to_date) :
m_fmt(fmt), m_str_to_date(str_to_date) {}
/** A string representing the format. */
const std::string m_fmt;
private:
/** Regular expression associated with the format string. This is to be
* used only internally by the gnc-datetime code.
*/
const std::string m_re;
std::optional<StringToDate> m_str_to_date;

friend class GncDateImpl;
};
Expand Down
38 changes: 38 additions & 0 deletions libgnucash/engine/test/gtest-gnc-datetime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ TEST(gnc_date_constructors, test_str_format_constructor)
{ "y-m-d", "1985.3.12", 1985, 3, 12},
{ "y-m-d", "3'6'8", 2003, 6, 8},
{ "y-m-d", "20130801", 2013, 8, 1},
{ "y-m-d", "2013 Aug 1", 2013, 8, 1},
{ "y-m-d", "2013 Aug 01",2013, 8, 1},
{ "y-m-d", "2013 August 01", 2013, 8, 1},
{ "y-m-d", "2013-August-1", 2013, 8, 1},
{ "y-m-d", "2009/Nov/04",2009, 11, 4},
{ "y-m-d","1985.Mar.12", 1985, 3, 12},
{ "d-m-y", "01-08-2013", 2013, 8, 1},
{ "d-m-y", "01-8-2013", 2013, 8, 1},
{ "d-m-y", "1-08-2013", 2013, 8, 1},
Expand All @@ -101,6 +107,9 @@ TEST(gnc_date_constructors, test_str_format_constructor)
{ "d-m-y", "12.3.1985", 1985, 3, 12},
{ "d-m-y", "8'6'3", 2003, 6, 8},
{ "d-m-y", "01082013", 2013, 8, 1},
{ "d-m-y", "1 Aug 2013", 2013, 8, 1},
{ "d-m-y", "1 Sep 2013", 2013, 9, 1},
{ "d-m-y", "1 September 2013", 2013, 9, 1},
{ "m-d-y", "08-01-2013", 2013, 8, 1},
{ "m-d-y", "8-01-2013", 2013, 8, 1},
{ "m-d-y", "08-1-2013", 2013, 8, 1},
Expand All @@ -113,6 +122,8 @@ TEST(gnc_date_constructors, test_str_format_constructor)
{ "m-d-y", "3.12.1985", 1985, 3, 12},
{ "m-d-y", "6'8'3", 2003, 6, 8},
{ "m-d-y", "08012013", 2013, 8, 1},
{ "m-d-y", "November 4, 2009", 2009, 11, 4},
{ "m-d-y", "Nov 4, 2009", 2009, 11, 4},
{ "d-m", "01-08", curr_year, 8, 1},
{ "d-m", "01-8", curr_year, 8, 1},
{ "d-m", "1-08", curr_year, 8, 1},
Expand All @@ -130,6 +141,29 @@ TEST(gnc_date_constructors, test_str_format_constructor)
{ "m-d", "6'8", curr_year, 6, 8},
{ "m-d", "0801", curr_year, 8, 1},

// invalid dates
{ "d-m-y", "0 Aug 2013", -1, -1, -1},
{ "d-m-y", "31 Sep 2013", -1, -1, -1},
{ "d-m-y", "31 September 2013", -1, -1, -1},
{ "d-m-y", "31/11/2009", -2, -2, -2},
{ "d-m-y", "34.3.1985", -2, -2, -2},
{ "m-d-y", "November 41, 2009", -1, -1, -1},
{ "m-d-y", "Nov 31, 2009", -1, -1, -1},
{ "y-m-d", "2013 Aug 0", -1, -1, -1},
{ "y-m-d", "2013 Feb 30", -1, -1, -1},
{ "y-m-d", "2013 August 0", -1, -1, -1},
{ "y-m-d", "2013-June-31", -1, -1, -1},
{ "y-m-d", "2009/Nov/0", -1, -1, -1},
{ "y-m-d", "1985.Mar.32", -1, -1, -1},

// 2-digit dates are not parsable with months as words
{ "d-m-y", "1 Sep 13", -1, -1, -1},
{ "d-m-y", "1 September 13", -1, -1, -1},
{ "m-d-y", "November 4, 24", -1, -1, -1},
{ "m-d-y", "Nov 4, 23", -1, -1, -1},
{ "m-d-y", "Nov 29, 24", -1, -1, -1},
{ "y-m-d", "13-June-11", -1, -1, -1},

// ambiguous date formats
// current parser doesn't know how to disambiguate
// and hence refuses to parse
Expand Down Expand Up @@ -186,6 +220,10 @@ TEST(gnc_date_constructors, test_str_format_constructor)
{
got_year = got_month = got_day = -1;
}
catch (const std::out_of_range&)
{
got_year = got_month = got_day = -2;
}

EXPECT_TRUE ((got_year == test_dates[i].exp_year) &&
(got_month == test_dates[i].exp_month) &&
Expand Down

0 comments on commit db7cf7c

Please sign in to comment.