forked from weltliteratur/dnb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.py
executable file
·33 lines (27 loc) · 948 Bytes
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import print_function
import rdflib
import gzip
from rdflib.namespace import DCTERMS, DC
# Extract title and page information for works by known writers from the DNB
# dump and write one tab-separated record per (work, creator) match.
#
# Inputs:  wd_result        - one "<entity> <GND id>" pair per line
#          DNBTitel.ttl.gz  - gzip-compressed RDF dump
# Output:  dnb_pages.tsv    - columns: subject, GND id, title, pages
#
# NOTE: the text I/O below passes an explicit `encoding`, which requires
# Python 3.

# read Wikidata GND ids of writers
writers = set()
with open('wd_result', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        # exactly two whitespace-separated fields are expected; anything
        # else still fails loudly instead of silently loading bad ids
        entity, gndid = line.split()
        writers.add(gndid)

# read DNB data
g = rdflib.Graph()
# decode explicitly as UTF-8 so parsing does not depend on the locale, and
# close the gzip handle as soon as the graph is loaded
with gzip.open('DNBTitel.ttl.gz', 'rt', encoding='utf-8') as dump:
    g.parse(dump, format='n3')

isbd = rdflib.Namespace("http://iflastandards.info/ns/isbd/elements/")

# the context manager closes the output even if the loop raises; the file is
# written as UTF-8 text, so values are printed directly rather than
# pre-encoded to bytes (which would end up as a b'...' repr in the output)
with open('dnb_pages.tsv', 'wt', encoding='utf-8') as fout:
    for s, o in g.subject_objects(DCTERMS["creator"]):
        # property with linked GND id found, extract GND id: the part after
        # the last '/'; rpartition yields the whole value when there is no
        # '/', so such objects simply fail the membership test below
        gndid = o.rpartition('/')[2]
        # check whether this is a writer
        if gndid in writers:
            # get title and page number; g.value returns None when the
            # property is absent, which print renders as "None"
            title = g.value(s, DC["title"], None)
            pages = g.value(s, isbd["P1053"], None)
            print(s, gndid, title, pages, sep='\t', file=fout)