Skip to content

Commit

Permalink
capturing identifier from header for dublincore parser
Browse files: browse the repository at this point in the history
  • Loading branch information
Mugdha Polimera authored and committed Oct 1, 2024
1 parent 9bd6801 commit a86ef36
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 3 deletions.
6 changes: 6 additions & 0 deletions adsingestp/parsers/dubcore.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,12 @@ def _parse_ids(self):
self.base_metadata["ids"] = {}
self.base_metadata["ids"]["pub-id"] = []

if self.input_header.find("identifier"):
for dc_id in self.input_header.find_all("identifier"):
self.base_metadata["ids"]["pub-id"].append(
{"attribute": "publisher-id", "Identifier": dc_id.get_text()}
)

if self.input_metadata.find("dc:identifier"):
for dc_id in self.input_metadata.find_all("dc:identifier"):
self.base_metadata["ids"]["pub-id"].append(
Expand Down
4 changes: 4 additions & 0 deletions tests/stubdata/output/arxiv_0901_2443.json
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@
"pubYear": "2009"
},
"publisherIDs": [
{
"Identifier": "oai:arXiv.org:0901.2443",
"attribute": "publisher-id"
},
{
"Identifier": "http://arxiv.org/abs/0901.2443",
"attribute": "publisher-id"
Expand Down
4 changes: 4 additions & 0 deletions tests/stubdata/output/arxiv_1711_04702.json
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@
"pubYear": "2017"
},
"publisherIDs": [
{
"Identifier": "oai:arXiv.org:1711.04702",
"attribute": "publisher-id"
},
{
"Identifier": "http://arxiv.org/abs/1711.04702",
"attribute": "publisher-id"
Expand Down
4 changes: 4 additions & 0 deletions tests/stubdata/output/arxiv_1711_05739.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@
"pubYear": "2017"
},
"publisherIDs": [
{
"Identifier": "oai:arXiv.org:1711.05739",
"attribute": "publisher-id"
},
{
"Identifier": "http://arxiv.org/abs/1711.05739",
"attribute": "publisher-id"
Expand Down
4 changes: 4 additions & 0 deletions tests/stubdata/output/arxiv_math_0306266.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
"pubYear": "2003"
},
"publisherIDs": [
{
"Identifier": "oai:arXiv.org:math/0306266",
"attribute": "publisher-id"
},
{
"Identifier": "http://arxiv.org/abs/math/0306266",
"attribute": "publisher-id"
Expand Down
4 changes: 4 additions & 0 deletions tests/stubdata/output/dubcore_pos_ecrs_002.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
"publisher": "Sissa Medialab"
},
"publisherIDs": [
{
"Identifier": "oai:pos.sissa.it:ECRS/002",
"attribute": "publisher-id"
},
{
"Identifier": "PoS(ECRS)002",
"attribute": "publisher-id"
Expand Down
8 changes: 5 additions & 3 deletions tests/test_dublincore.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@ def test_dubcore(self):
with open(test_infile, "rb") as fp:
input_data = fp.read()

parsed = parser.parse(input_data)

with open(test_outfile, "rb") as fp:
output_text = fp.read()
output_data = json.loads(output_text)

parsed = parser.parse(input_data)

# make sure this is valid schema
try:
ads_schema_validator().validate(parsed)
Expand All @@ -53,6 +53,7 @@ def test_dubcore(self):
self.assertTrue(abs(time_difference) < datetime.timedelta(seconds=10))
parsed["recordData"]["parsedTime"] = ""

self.maxDiff = None
self.assertEqual(parsed, output_data)


Expand All @@ -77,11 +78,12 @@ def test_dubcore_multi(self):
with open(test_infile, "r") as fp:
input_data = fp.read()

parsed = parser.parse(input_data, header=True)

with open(test_outfile_header, "r") as fp:
output_text = fp.read()
output_data_header = output_text.strip().split("\n\n")

parsed = parser.parse(input_data, header=True)
self.assertEqual(parsed, output_data_header)

with open(test_outfile_noheader, "r") as fp:
Expand Down

0 comments on commit a86ef36

Please sign in to comment.