Skip to content

Commit 0ea52c7

Browse files
authored
PA Parser [SDESK-7427] (#2851)
* PA Parser [SDESK-7427] * remove unwanted fields * revert priority and embargo * refactor code
1 parent 22fe3f6 commit 0ea52c7

File tree

4 files changed

+434
-0
lines changed

4 files changed

+434
-0
lines changed

superdesk/io/feed_parsers/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -354,3 +354,4 @@ class EmailFeedParser(FeedParser, metaclass=ABCMeta):
354354
from superdesk.io.feed_parsers.ap_media import APMediaFeedParser # NOQA
355355
from superdesk.io.feed_parsers.dpa_newsml import DPAFeedParser # NOQA
356356
from superdesk.io.feed_parsers.afp_newsml_1_2_new import AFPNewsMLFeedParser # NOQA
357+
from superdesk.io.feed_parsers.pa_parser import PAParser # NOQA
+209
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
# -*- coding: utf-8; -*-
2+
#
3+
# This file is part of Superdesk.
4+
#
5+
# Copyright 2025 Sourcefabric z.u. and contributors.
6+
#
7+
# For the full copyright and license information, please see the
8+
# AUTHORS and LICENSE files distributed with this source code, or
9+
# at https://www.sourcefabric.org/superdesk/license
10+
11+
from datetime import datetime
12+
from superdesk.utc import utc
13+
from superdesk.etree import etree
14+
from superdesk.io.registry import register_feed_parser
15+
from superdesk.io.feed_parsers.nitf import NITFFeedParser
16+
17+
18+
class PAParser(NITFFeedParser):
    """
    Feed Parser for PA (Press Association) XML files.

    The parser expects a ``document`` root element with a ``nitf`` child
    (headline, byline, body) and reads additional metadata from the first
    ``xn:Resource`` element found below the root.
    """

    NAME = "pa_parser"
    label = "PA Parser"

    # Namespace map used for every lookup inside the ``xn:Resource`` section.
    _XN_NAMESPACES = {"xn": "http://www.xmlnews.org/namespaces/meta#"}
    # Every vendor specific value has the form ``PRESSUK_:<key>=<value>``.
    _VENDOR_PREFIX = "PRESSUK_:"
    # ``%z`` accepts any UTC offset (``+00:00``, ``+0100``, ``Z``), not only ``+00:00``.
    _TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S%z"

    def can_parse(self, xml):
        """
        Check if the XML can be parsed by this parser.

        :param xml: root element of the document
        :return: ``True`` when the root tag is ``document`` and it has a direct ``nitf`` child
        """
        return xml.tag == "document" and xml.find("nitf") is not None

    def parse(self, xml, provider=None):
        """
        Parse the XML and return a single dictionary representing the news item.

        :param xml: root element of the document
        :param provider: ingest provider, not used by this parser
        :return: dict with the fields that could be extracted
        """
        item = {}
        nitf = xml.find("nitf")
        if nitf is not None:
            self.parse_head(nitf, item)
            self.parse_body(nitf, item)
        # The Resource metadata is looked up from the root, independently of ``nitf``.
        self.parse_resource(xml, item)
        return item

    def parse_head(self, nitf, item):
        """
        Parse the head section of the NITF document.

        Sets ``item["headline"]`` from ``head/title`` when it has text.
        """
        head = nitf.find("head")
        if head is None:
            return
        title = head.find("title")
        if title is not None and title.text:
            item["headline"] = title.text

    def parse_body(self, nitf, item):
        """
        Parse the body section of the NITF document.

        Delegates to :meth:`parse_body_head` and :meth:`parse_body_content`.
        """
        body = nitf.find("body")
        if body is None:
            return
        self.parse_body_head(body, item)
        self.parse_body_content(body, item)

    def parse_body_head(self, body, item):
        """
        Parse the body.head section of the NITF document (headline and byline).
        """
        body_head = body.find("body.head")
        if body_head is None:
            return
        self.parse_hedline(body_head, item)
        self.parse_byline(body_head, item)

    def parse_hedline(self, body_head, item):
        """
        Parse the hedline section of the NITF document.

        Sets ``item["headline"]`` from ``hedline/hl1``; as this runs after
        :meth:`parse_head` it overrides a headline taken from ``head/title``.
        """
        hedline = body_head.find("hedline")
        if hedline is None:
            return
        hl1 = hedline.find("hl1")
        if hl1 is not None and hl1.text:
            item["headline"] = hl1.text

    def parse_byline(self, body_head, item):
        """
        Parse the byline section of the NITF document.

        Sets ``item["byline"]`` from ``byline/bytag`` when it has text.
        """
        byline = body_head.find("byline")
        if byline is None:
            return
        bytag = byline.find("bytag")
        if bytag is not None and bytag.text:
            item["byline"] = bytag.text

    def parse_body_content(self, body, item):
        """
        Parse the body.content section of the NITF document and clean up HTML content.

        Sets ``item["body_html"]`` to the serialised content with the
        ``section``, ``body``, ``span`` and ``body.content`` wrappers removed.
        """
        body_content = body.find("body.content")
        if body_content is None:
            return
        # Serialise as HTML and parse it again with the HTML parser, so the
        # tree that gets cleaned is an HTML tree rather than the NITF one.
        body_html = etree.tostring(body_content, encoding="unicode", method="html")
        tree = etree.fromstring(body_html, etree.HTMLParser())
        # strip_tags drops the listed tags themselves but keeps their children and text.
        etree.strip_tags(tree, "section", "body", "span", "body.content")
        item["body_html"] = "".join(etree.tostring(child, encoding="unicode", method="html") for child in tree)

    def parse_resource(self, xml, item):
        """
        Parse the Resource section of the XML document.

        Sets ``item["guid"]`` from the first non-empty ``Document ID`` vendor
        value and then fills the remaining metadata fields.  Does nothing when
        the document has no ``xn:Resource`` element.
        """
        resource = xml.find(".//xn:Resource", namespaces=self._XN_NAMESPACES)
        if resource is None:
            return

        # The document id is searched in all descendants, the other vendor
        # values only in the direct children of the Resource element.
        document_id = next(self._vendor_values(resource, "Document ID", path=".//xn:vendorData"), None)
        if document_id:
            item["guid"] = document_id

        self.parse_versioncreated(resource, item)
        self.parse_firstcreated(resource, item)
        self.parse_abstract(resource, item)
        self.parse_usageterms(resource, item)
        self.parse_word_count(resource, item)
        self.parse_keywords(resource, item)
        self.parse_embargo(resource, item)
        self.parse_priority(resource, item)

    def parse_versioncreated(self, resource, item):
        """
        Parse the versioncreated timestamp from the Resource section.

        :raises ValueError: when ``xn:publicationTime`` is not a
            ``YYYY-MM-DDTHH:MM:SS`` timestamp followed by a UTC offset
        """
        publication_time = self._resource_text(resource, "publicationTime")
        if publication_time:
            item["versioncreated"] = self._parse_utc_timestamp(publication_time)

    def parse_firstcreated(self, resource, item):
        """
        Parse the firstcreated timestamp from the Resource section.

        :raises ValueError: when ``xn:receivedTime`` is not a
            ``YYYY-MM-DDTHH:MM:SS`` timestamp followed by a UTC offset
        """
        received_time = self._resource_text(resource, "receivedTime")
        if received_time:
            item["firstcreated"] = self._parse_utc_timestamp(received_time)

    def parse_abstract(self, resource, item):
        """
        Parse the abstract from the Resource section (``xn:description``).
        """
        description = self._resource_text(resource, "description")
        if description:
            item["abstract"] = description

    def parse_usageterms(self, resource, item):
        """
        Parse the usage terms (copyright) from the Resource section.
        """
        usage_terms = self._resource_text(resource, "copyright")
        if usage_terms:
            item["usageterms"] = usage_terms

    def parse_embargo(self, resource, item):
        """
        Parse the embargo timestamp from the Resource section.

        The value is read from the ``Expiration Date`` vendor data and stored
        as an ISO 8601 string.  Values that cannot be parsed are ignored; when
        several are valid the last one wins.
        """
        # NOTE(review): the embargo is stored as a string and is sourced from
        # "Expiration Date" - confirm this is what consumers of the item expect.
        for value in self._vendor_values(resource, "Expiration Date"):
            try:
                embargo_dt = datetime.strptime(value, self._TIMESTAMP_FORMAT)
            except ValueError:
                # Best effort: a malformed date must not fail the whole item.
                continue
            item["embargo"] = embargo_dt.isoformat()

    def parse_word_count(self, resource, item):
        """
        Parse the word count from the ``Word Count`` vendor data.

        Non-numeric values are ignored.
        """
        self._set_int_from_vendor_data(resource, "Word Count", item, "word_count")

    def parse_priority(self, resource, item):
        """
        Parse the priority from the ``PA Priority`` vendor data.

        Non-numeric values are ignored.
        """
        self._set_int_from_vendor_data(resource, "PA Priority", item, "priority")

    def parse_keywords(self, resource, item):
        """
        Parse the keywords from the ``Keyword`` vendor data.

        Sets ``item["keywords"]`` to the list of non-empty keywords in document
        order; the key is not set when there are none.
        """
        keywords = list(self._vendor_values(resource, "Keyword"))
        if keywords:
            item["keywords"] = keywords

    def _resource_text(self, resource, tag):
        """
        Return the text of the direct ``xn:<tag>`` child of ``resource``.

        :return: the text, or ``None`` when the element is missing or empty
        """
        element = resource.find("xn:" + tag, namespaces=self._XN_NAMESPACES)
        if element is not None and element.text:
            return element.text
        return None

    def _vendor_values(self, resource, key, path="xn:vendorData"):
        """
        Yield the non-empty values of the ``PRESSUK_:<key>=`` vendor data entries.

        :param resource: the ``xn:Resource`` element
        :param key: vendor key without prefix and ``=``, e.g. ``"Keyword"``
        :param path: ElementPath used to locate the ``vendorData`` elements;
            the default only matches direct children of ``resource``
        """
        marker = self._VENDOR_PREFIX + key + "="
        for vendor_data in resource.findall(path, namespaces=self._XN_NAMESPACES):
            text = vendor_data.text
            if not text or marker not in text:
                continue
            value = text.split(marker)[-1].strip()
            if value:
                yield value

    def _set_int_from_vendor_data(self, resource, key, item, field):
        """
        Store the vendor value ``key`` as an integer in ``item[field]``.

        Values that are not integers are skipped; when several are valid the
        last one wins.
        """
        for value in self._vendor_values(resource, key):
            try:
                item[field] = int(value)
            except ValueError:
                # Best effort: keep going, a later entry may be valid.
                continue

    def _parse_utc_timestamp(self, text):
        """
        Parse a ``YYYY-MM-DDTHH:MM:SS<offset>`` timestamp and return it in UTC.

        The offset is honoured and converted instead of being assumed to be
        ``+00:00``, so timestamps with another offset are no longer rejected.

        :raises ValueError: when ``text`` does not match the expected format
        """
        return datetime.strptime(text.strip(), self._TIMESTAMP_FORMAT).astimezone(utc)
207+
208+
209+
# Register a PAParser instance under PAParser.NAME when this module is imported.
register_feed_parser(PAParser.NAME, PAParser())
+104
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# -*- coding: utf-8; -*-
2+
#
3+
# This file is part of Superdesk.
4+
#
5+
# Copyright 2025 Sourcefabric z.u. and contributors.
6+
#
7+
# For the full copyright and license information, please see the
8+
# AUTHORS and LICENSE files distributed with this source code, or
9+
# at https://www.sourcefabric.org/superdesk/license
10+
11+
import os
12+
from superdesk.etree import etree
13+
from superdesk.tests import TestCase
14+
from superdesk.io.feed_parsers.pa_parser import PAParser
15+
16+
17+
class PAParserTestCase(TestCase):
    """
    Checks the item PAParser produces for the ``pa_parser.xml`` fixture.
    """

    def setUp(self):
        super().setUp()
        this_file = os.path.realpath(__file__)
        self.dirname = os.path.dirname(this_file)
        fixture_path = os.path.join(self.dirname, "../fixtures/pa_parser.xml")
        self.fixture = os.path.normpath(fixture_path)
        self.provider = {"name": "Test"}
        # The fixture is parsed once per test and the result kept on the instance.
        with open(self.fixture, "rb") as fixture_file:
            root = etree.parse(fixture_file).getroot()
        self.item = PAParser().parse(root, self.provider)

    def test_headline(self):
        """
        The headline of the fixture is extracted.
        """
        expected = "MORE THAN FOUR IN FIVE SCHOOL LEADERS ABUSED BY PARENTS IN PAST YEAR - SURVEY"
        self.assertEqual(self.item.get("headline"), expected)

    def test_byline(self):
        """
        The byline of the fixture is extracted.
        """
        expected = "By Eleanor Busby, PA Education Correspondent"
        self.assertEqual(self.item.get("byline"), expected)

    def test_versioncreated(self):
        """
        The versioncreated timestamp is extracted with its UTC offset.
        """
        versioncreated = self.item.get("versioncreated")
        self.assertEqual(versioncreated.isoformat(), "2025-03-04T00:01:00+00:00")

    def test_firstcreated(self):
        """
        The firstcreated timestamp is extracted with its UTC offset.
        """
        firstcreated = self.item.get("firstcreated")
        self.assertEqual(firstcreated.isoformat(), "2025-03-03T02:45:45+00:00")

    def test_abstract(self):
        """
        The abstract of the fixture is extracted.
        """
        expected = (
            "The majority of school leaders have reported being abused by parents in the past year, "
            "a survey has suggested."
        )
        self.assertEqual(self.item.get("abstract"), expected)

    def test_keywords(self):
        """
        The keywords contain the expected entry.
        """
        keywords = self.item.get("keywords")
        self.assertIn("EDUCATION", keywords)

    def test_word_count(self):
        """
        The word count is extracted as an integer.
        """
        word_count = self.item.get("word_count")
        self.assertEqual(word_count, 749)

    def test_body_html(self):
        """
        The body HTML starts with the first paragraph and contains later ones.
        """
        body_html = self.item.get("body_html")
        opening = "<p>The majority of school leaders have reported being abused by parents"
        self.assertTrue(body_html.startswith(opening))
        later_paragraphs = (
            "<p>More than two in five school leaders (42%)",
            "<p>One in 10 school leaders said they had suffered physical violence",
        )
        for fragment in later_paragraphs:
            self.assertIn(fragment, body_html)

    def test_usageterms(self):
        """
        The usage terms (copyright) of the fixture are extracted.
        """
        expected = "Press Association 2025"
        self.assertEqual(self.item.get("usageterms"), expected)

0 commit comments

Comments
 (0)