From d8830176512ac7796558f82d1d898abce8930c7e Mon Sep 17 00:00:00 2001 From: Aurelien Didier Date: Thu, 27 Jun 2024 16:57:09 +0200 Subject: [PATCH] Update date_extractor.py --- .../extractor/extractors/date_extractor.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/newsplease/pipeline/extractor/extractors/date_extractor.py b/newsplease/pipeline/extractor/extractors/date_extractor.py index fc3b120a..cc2c0773 100644 --- a/newsplease/pipeline/extractor/extractors/date_extractor.py +++ b/newsplease/pipeline/extractor/extractors/date_extractor.py @@ -124,11 +124,17 @@ def _extract_from_meta(self, html): date = meta['content'].strip() break + # + if 'dc.date' == item_prop: + date = meta['content'].strip() + break + # if 'article:published_time' == meta_property: date = meta['content'].strip() break - # + + # if 'date' == meta_name: date = meta['content'].strip() break @@ -182,6 +188,21 @@ def _extract_from_meta(self, html): date = meta['content'].strip() break + # + if 'dcterms.date' == item_prop: + date = meta['content'].strip() + break + + # + if 'dcterms.created' == item_prop: + date = meta['content'].strip() + break + + # + if 'og:published_time datetime' == item_prop: + date = meta['content'].strip() + break + # if 'og:image' == meta_property or "image" == item_prop: