-
Notifications
You must be signed in to change notification settings - Fork 4
/
trovereader.py
56 lines (46 loc) · 1.81 KB
/
trovereader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
Code for reading data from Trove archive files in XML format
Trove archives are XML files, potentially very large with one <record> element
per record or document. We provide a generator function that iterates
over the records in a file as Python dictionaries.
"""
from typing import Dict, Iterator, List

import pandas as pd
from lxml import etree
def trove_to_dataframe(xmlfile: str) -> pd.DataFrame:
    """Load the records of a Trove XML export into a pandas DataFrame.

    Args:
        xmlfile: Path of the Trove XML file; passed unchanged to
            ``trove_parser``.

    Returns:
        A DataFrame with one row per record yielded by ``trove_parser``
        and one column per property name found in the records, with a
        fixed set of unwanted metadata columns removed.
    """
    unwanted = [
        'bibliographicCitation',
        'coverage',
        'format',
        'language',
        'metadataSource',
        'type',
    ]
    df = pd.DataFrame(trove_parser(xmlfile))
    # A column only exists when at least one record carried that property,
    # so an empty or sparse export may lack some of these. errors='ignore'
    # skips the missing ones instead of raising KeyError.
    return df.drop(columns=unwanted, errors='ignore')
def trove_parser(xmlfile: str) -> Iterator[Dict[str, str]]:
    """Yield the records of a Trove XML export one at a time.

    The file is parsed incrementally, and one dictionary is produced for
    every ``<record>`` element as soon as it has been read completely.

    Each dictionary maps a property name to the whitespace-stripped text
    of the FIRST child element carrying that name; later children with
    the same name are ignored. The property name is the child's tag,
    except for ``<identifier>`` children that have a ``linktype``
    attribute, which are stored under the value of that attribute.
    Children without text are skipped.

    Args:
        xmlfile: Path of the Trove XML file to read.

    Yields:
        One ``dict`` of property name -> text per ``<record>`` element.
    """
    context = etree.iterparse(xmlfile, events=('end',), tag='record')
    for _event, elem in context:
        record: Dict[str, str] = {}
        for child in elem:
            # Comments and processing instructions expose a non-string tag;
            # they are not metadata properties, so leave them out.
            if not isinstance(child.tag, str) or not child.text:
                continue
            if child.tag == 'identifier' and 'linktype' in child.attrib:
                propname = child.attrib['linktype']
            else:
                propname = child.tag
            # Keep only the first value seen for each property.
            record.setdefault(propname, child.text.strip())
        yield record

        # Release the subtree that was just consumed, together with the
        # already-processed siblings still attached to the parent.
        # Otherwise the whole document accumulates in memory while
        # iterating a large archive.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]
if __name__ == '__main__':
    # Smoke test: print the identifier and the description length of the
    # first four records of a sample archive.
    # Alternative, smaller sample: "data/nla-advocate-sample.xml"
    xmlfile = "data/nla.obj-573721295_Aborginies_Advocate.xml"
    for i, record in enumerate(trove_parser(xmlfile)):
        if i > 3:
            break
        # Record values are plain strings, so indexing them with [0] would
        # only give the first character. .get() is used because a record
        # may lack either property (an <identifier> with a linktype
        # attribute is stored under that linktype instead).
        print(record.get('identifier'), len(record.get('description', '')))