Skip to content

Commit

Permalink
Merge pull request #270 from anteverse/feature/more-dates-in-extractor
Browse files Browse the repository at this point in the history
Add 4 date metadata patterns in date extraction
  • Loading branch information
fhamborg authored Jul 1, 2024
2 parents c7166d7 + d883017 commit 2015e86
Showing 1 changed file with 22 additions and 1 deletion.
23 changes: 22 additions & 1 deletion newsplease/pipeline/extractor/extractors/date_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,17 @@ def _extract_from_meta(self, html):
date = meta['content'].strip()
break

# <meta itemprop="dc.date" content="2015-11-26T11:53:00.000Z" />
if 'dc.date' == item_prop:
date = meta['content'].strip()
break

# <meta property="article:published_time" content="2015-11-25" />
if 'article:published_time' == meta_property:
date = meta['content'].strip()
break
# <meta name="Date" content="2015-11-26" />

# <meta name="Date" content="2015-11-26" />
if 'date' == meta_name:
date = meta['content'].strip()
break
Expand Down Expand Up @@ -182,6 +188,21 @@ def _extract_from_meta(self, html):
date = meta['content'].strip()
break

# <meta itemprop="dcterms.date" content="2015-11-26T11:53:00.000Z" />
if 'dcterms.date' == item_prop:
date = meta['content'].strip()
break

# <meta itemprop="dcterms.created" content="2015-11-26T11:53:00.000Z" />
if 'dcterms.created' == item_prop:
date = meta['content'].strip()
break

# <meta itemprop="og:published_time datetime" content="2015-11-26T11:53:00.000Z" />
if 'og:published_time datetime' == item_prop:
date = meta['content'].strip()
break

# <meta property="og:image" content="http://www.dailytimes.com.pk/digital
# _images/400/2015-11-26/norway-return-number-of-asylum-seekers-to-pakistan-1448538771-7363.jpg"/>
if 'og:image' == meta_property or "image" == item_prop:
Expand Down

0 comments on commit 2015e86

Please sign in to comment.