diff --git a/scraper/scraper.go b/scraper/scraper.go
index 4afbdec..cf04683 100644
--- a/scraper/scraper.go
+++ b/scraper/scraper.go
@@ -370,6 +370,19 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string
 		}
 	}
 
+	c.guessYear(items, time.Now())
+
+	return items, nil
+}
+
+// guessYear adapts the year of every date field for which the year has to be
+// guessed. The first item's date is compared with ref, every following item's
+// date with the previous item's date of the same field (or with ref if the
+// previous item has no date there). Of the years around the date compared
+// against (year-1, year, year+1) the one that brings both dates closest
+// together is chosen. Years in which the day does not exist, eg Feb 29 in a
+// non-leap year, are skipped. items is modified in place.
+func (c *Scraper) guessYear(items []map[string]interface{}, ref time.Time) {
 	// get date field names where we need to adapt the year
 	dateFieldsGuessYear := map[string]bool{}
 	for _, f := range c.Fields {
@@ -384,46 +397,49 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string
 	// event websites mostly contain a list of events ordered by date. Sometimes the date does
 	// not contain the year. In that case we could simply set the year to the current year but
 	// it might happen that the list of events spans across more than one year into the next
-	// year. In that case we still want to set the correct year which would be current year + 1.
+	// year. In that case we still want to set the correct year which would be current year + n.
+	// Moreover, the list might not be ordered at all. In that case we also want to try to set
+	// the correct year.
 	if len(dateFieldsGuessYear) > 0 {
 		for i, item := range items {
 			for name, val := range item {
 				if dateFieldsGuessYear[name] {
 					if t, ok := val.(time.Time); ok {
-						now := time.Now()
-						yesterday := now.AddDate(0, 0, -1)
-						// we compare the date with yesterday, not now, to accomodate for the fact that at the time we scrape
-						// the event might have already taken place but not yet removed from the website. Let's see if 1 day
-						// is a reasonable margin.
-						if t.Before(yesterday) {
-							newT := time.Date(t.Year()+1, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location())
-							item[name] = newT
-							continue
-						}
+						// for the first item we compare this item's date with the given reference
+						// (usually 'now') and try to find the most suitable year, ie the year that
+						// brings this item's date closest to the reference.
+						// for the remaining items we do the same as with the first item except
+						// that we compare this item's date to the previous item's date instead
+						// of the reference. If the previous item has no date in this field we
+						// fall back to the reference. We must not reuse 'ref' itself for that
+						// because a failed type assertion would turn it into the zero time.
+						base := ref
 						if i > 0 {
 							if prevT, ok := items[i-1][name].(time.Time); ok {
-								// here we do not compare the current date directly to the previous date. There
-								// are cases where we wouldn't want the year to be increased by one even though
-								// the previous date is bigger than the current one. Such cases occur when a
-								// website contains a list of items that are sorted by date but within a day are
-								// not sorted by time. To prevent the year from being increased wrongly in that
-								// case we introduce a min delta of 1 day.
-								tmpT := prevT.AddDate(0, 0, -1)
-								if t.Before(tmpT) {
-									// probably there is still a bug here when we have a list that spans two years
-									// changes..
-									newT := time.Date(t.Year()+1, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location())
-									item[name] = newT
-								}
+								base = prevT
 							}
 						}
+						// time.Date normalizes dates that do not exist, eg Feb 29 in a non-leap
+						// year becomes Mar 1. Such candidates are skipped so that the day of the
+						// item never changes. If no candidate is left the date is kept as is.
+						newDate := t
+						minDiff := time.Duration(1<<63 - 1)
+						for y := base.Year() - 1; y <= base.Year()+1; y++ {
+							tmpT := time.Date(y, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location())
+							if tmpT.Month() != t.Month() || tmpT.Day() != t.Day() {
+								continue
+							}
+							if newDiff := tmpT.Sub(base).Abs(); newDiff < minDiff {
+								minDiff = newDiff
+								newDate = tmpT
+							}
+						}
+						item[name] = newDate
 					}
 				}
 			}
 		}
 	}
-
-	return items, nil
 }
 
 func (c *Scraper) initializeFilters() error {
diff --git a/scraper/scraper_test.go b/scraper/scraper_test.go
index d9e99ca..5552332 100644
--- a/scraper/scraper_test.go
+++ b/scraper/scraper_test.go
@@ -648,3 +648,164 @@ func TestExtractFieldDate29Feb(t *testing.T) {
 		t.Fatalf("expected '2024' as year of date but got '%d'", dt.Year())
 	}
 }
+
+// guessYearLocation loads the location used by the guessYear tests. It fails the
+// test if the location is not available (eg no tzdata installed) because time.Date
+// panics when called with a nil location.
+func guessYearLocation(t *testing.T) *time.Location {
+	t.Helper()
+	loc, err := time.LoadLocation("CET")
+	if err != nil {
+		t.Fatalf("could not load location 'CET': %v", err)
+	}
+	return loc
+}
+
+// guessYearScraper returns a scraper with a single date field named "date" that has
+// year guessing enabled.
+func guessYearScraper() *Scraper {
+	return &Scraper{
+		Fields: []Field{
+			{
+				Type:      "date",
+				GuessYear: true,
+				Name:      "date",
+			},
+		},
+	}
+}
+
+// checkGuessYear puts every given date into an item of its own, runs guessYear with
+// the given reference and compares the resulting dates with want.
+func checkGuessYear(t *testing.T, ref time.Time, dates, want []time.Time) {
+	t.Helper()
+	items := make([]map[string]interface{}, 0, len(dates))
+	for _, d := range dates {
+		items = append(items, map[string]interface{}{"date": d})
+	}
+	guessYearScraper().guessYear(items, ref)
+	for i, item := range items {
+		got, ok := item["date"].(time.Time)
+		if !ok || !got.Equal(want[i]) {
+			t.Fatalf("item %d: expected date '%v' but got '%v'", i, want[i], item["date"])
+		}
+	}
+}
+
+func TestGuessYearSimple(t *testing.T) {
+	// items dates span period around change of year
+	loc := guessYearLocation(t)
+	checkGuessYear(t,
+		time.Date(2023, 11, 30, 20, 30, 0, 0, loc),
+		[]time.Time{
+			time.Date(2023, 12, 2, 20, 30, 0, 0, loc),
+			time.Date(2023, 12, 24, 21, 30, 0, 0, loc),
+			time.Date(2023, 1, 2, 20, 0, 0, 0, loc),
+		},
+		[]time.Time{
+			time.Date(2023, 12, 2, 20, 30, 0, 0, loc),
+			time.Date(2023, 12, 24, 21, 30, 0, 0, loc),
+			time.Date(2024, 1, 2, 20, 0, 0, 0, loc),
+		},
+	)
+}
+
+func TestGuessYearUnordered(t *testing.T) {
+	// items dates are not perfectly ordered and span
+	// period around change of year
+	loc := guessYearLocation(t)
+	checkGuessYear(t,
+		time.Date(2023, 11, 1, 20, 30, 0, 0, loc),
+		[]time.Time{
+			time.Date(2023, 11, 2, 20, 30, 0, 0, loc),
+			time.Date(2023, 12, 14, 20, 30, 0, 0, loc),
+			time.Date(2023, 12, 2, 20, 30, 0, 0, loc),
+			time.Date(2023, 12, 24, 21, 30, 0, 0, loc),
+			time.Date(2023, 1, 2, 20, 0, 0, 0, loc),
+		},
+		[]time.Time{
+			time.Date(2023, 11, 2, 20, 30, 0, 0, loc),
+			time.Date(2023, 12, 14, 20, 30, 0, 0, loc),
+			time.Date(2023, 12, 2, 20, 30, 0, 0, loc),
+			time.Date(2023, 12, 24, 21, 30, 0, 0, loc),
+			time.Date(2024, 1, 2, 20, 0, 0, 0, loc),
+		},
+	)
+}
+
+func TestGuessYear2Years(t *testing.T) {
+	// items dates span more than 2 years
+	loc := guessYearLocation(t)
+	checkGuessYear(t,
+		time.Date(2023, 11, 1, 20, 30, 0, 0, loc),
+		[]time.Time{
+			time.Date(2023, 12, 2, 20, 30, 0, 0, loc),
+			time.Date(2023, 1, 14, 20, 30, 0, 0, loc),
+			time.Date(2023, 5, 2, 20, 30, 0, 0, loc),
+			time.Date(2023, 9, 24, 21, 30, 0, 0, loc),
+			time.Date(2023, 2, 2, 20, 0, 0, 0, loc),
+		},
+		[]time.Time{
+			time.Date(2023, 12, 2, 20, 30, 0, 0, loc),
+			time.Date(2024, 1, 14, 20, 30, 0, 0, loc),
+			time.Date(2024, 5, 2, 20, 30, 0, 0, loc),
+			time.Date(2024, 9, 24, 21, 30, 0, 0, loc),
+			time.Date(2025, 2, 2, 20, 0, 0, 0, loc),
+		},
+	)
+}
+
+func TestGuessYearStartBeforeReference(t *testing.T) {
+	// items date start before given reference
+	loc := guessYearLocation(t)
+	checkGuessYear(t,
+		time.Date(2024, 1, 30, 20, 30, 0, 0, loc),
+		[]time.Time{
+			time.Date(2023, 12, 2, 20, 30, 0, 0, loc),
+			time.Date(2023, 12, 24, 21, 30, 0, 0, loc),
+			time.Date(2023, 1, 2, 20, 0, 0, 0, loc),
+		},
+		[]time.Time{
+			time.Date(2023, 12, 2, 20, 30, 0, 0, loc),
+			time.Date(2023, 12, 24, 21, 30, 0, 0, loc),
+			time.Date(2024, 1, 2, 20, 0, 0, 0, loc),
+		},
+	)
+}
+
+func TestGuessYearPrevItemWithoutDate(t *testing.T) {
+	// the previous item has no date in the date field so the item's date
+	// has to be compared with the given reference
+	loc := guessYearLocation(t)
+	items := []map[string]interface{}{
+		{
+			"date": "n/a",
+		},
+		{
+			"date": time.Date(2023, 1, 2, 20, 0, 0, 0, loc),
+		},
+	}
+	guessYearScraper().guessYear(items, time.Date(2023, 12, 30, 20, 30, 0, 0, loc))
+	want := time.Date(2024, 1, 2, 20, 0, 0, 0, loc)
+	if got, ok := items[1]["date"].(time.Time); !ok || !got.Equal(want) {
+		t.Fatalf("expected date '%v' but got '%v'", want, items[1]["date"])
+	}
+	if items[0]["date"] != "n/a" {
+		t.Fatalf("expected non-date value to stay untouched but got '%v'", items[0]["date"])
+	}
+}
+
+func TestGuessYear29Feb(t *testing.T) {
+	// Feb 29 does not exist in the years around the reference except for
+	// 2024 so the date must not silently be moved to Mar 1
+	loc := guessYearLocation(t)
+	checkGuessYear(t,
+		time.Date(2025, 1, 15, 20, 30, 0, 0, loc),
+		[]time.Time{
+			time.Date(2024, 2, 29, 20, 0, 0, 0, loc),
+		},
+		[]time.Time{
+			time.Date(2024, 2, 29, 20, 0, 0, 0, loc),
+		},
+	)
+}