Improve SAB data #489

Draft: wants to merge 5 commits into base: develop
15 changes: 11 additions & 4 deletions Makefile
@@ -4,10 +4,17 @@ source/cc-licenses.ttl: source/construct-cc-license-data.rq

source/sab.ttl: scripts/extract_sab_data_from_docx.py cache/esab-2015_1.docx
python3 $^ | trld -ijsonld -ottl > $@
# TODO 1: enhance with DDC-mappings: scripts/create_sab_skos_data.py +
# ../librisxl/whelk-core/src/main/java/se/kb/libris/export/dewey/dewey_sab.txt
# TODO 2: In XL, add precomposed usages (extract from usage in records)? See:
# ../librisxl/marc_export/src/main/resources/se/kb/libris/export/sabrub.txt # precomposed
# TODO: enhance with DDC-mappings (if allowed): scripts/create_sab_skos_data.py with
# ../librisxl/whelk-core/src/main/java/se/kb/libris/export/dewey/dewey_sab.txt

source/sab/precoordinated.ttl: scripts/make_precoordinated_sab_terms.py source/sab.ttl cache/sab-usages.tsv.gz
python3 $^ > $@ 2>logs/sab-unknown.tsv
# TODO: Compare extracted precoordinated.ttl usages with:
# ../librisxl/marc_export/src/main/resources/se/kb/libris/export/sabrub.txt # precomposed

cache/sab-usages.tsv.gz: scripts/sab-usages.rq
curl -s https://libris.kb.se/sparql -HAccept:text/tab-separated-values --data-urlencode query@$^ | gzip - > /tmp/sab-usages.tsv.gz
cp /tmp/sab-usages.tsv.gz $@

## SSIF 2011 (Obsolete)
#
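For orientation: the cache/sab-usages.tsv.gz rule above fetches a gzipped, tab-separated usage dump from the Libris SPARQL endpoint, and the new scripts/make_precoordinated_sab_terms.py (added below) reads it expecting the columns cls, count and sample. A minimal sketch of peeking at such a dump, assuming only that layout:

import csv
import gzip

# Peek at the usage dump fetched by the cache/sab-usages.tsv.gz rule.
# Assumes the tab-separated columns (cls, count, sample) that the new script
# expects; header handling is kept loose since the exact header is not shown here.
with gzip.open("cache/sab-usages.tsv.gz", "rt") as f:
    for i, row in enumerate(csv.reader(f, "excel-tab")):
        if len(row) != 3 or not row[1].isnumeric():
            continue  # skip a header or malformed row
        cls, count, sample = row
        print(cls, count, sample, sep="\t")
        if i > 5:
            break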
92 changes: 54 additions & 38 deletions scripts/create_sab_skos_data.py
@@ -1,45 +1,48 @@
# -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import csv
import json
import re
import sys
import urllib
from urllib.parse import quote


SKOS = "http://www.w3.org/2004/02/skos/core#"
SAB_BASE = "https://id.kb.se/def/scheme/sab/{0}"
DDC_BASE = "http://dewey.info/class/{0}/"
KBV = "https://id.kb.se/vocab/"
# SKOS = "http://www.w3.org/2004/02/skos/core#"
SAB_BASE = "https://id.kb.se/term/kssb/{0}"
DDC_BASE = None # "http://dewey.info/class/{0}/"
LANG = 'sv'

hint_link_map = {
#'#H': SKOS+'closeMatch',
#'#G': SKOS+'closeMatch',
#'#M': inverseOf SKOS+'closeMatch',
'#1': SKOS+'exactMatch',
'#2': SKOS+'broadMatch',
'#3': SKOS+'narrowMatch',
'#4': SKOS+'closeMatch',
'DDK22': SKOS+'relatedMatch', # TODO
'#1': 'exactMatch',
'#2': 'broadMatch',
'#3': 'narrowMatch',
'#4': 'closeMatch',
'DDK22': 'relatedMatch', # TODO
}

def create_data(sab_codes_fpath, ddc_map_fpath, limit):

def create_data(ddc_map_fpath, sab_codes_fpath):
rmap = {}
create_sab_skos_data(rmap, sab_codes_fpath, limit=limit)
if sab_codes_fpath:
create_sab_skos_data(rmap, sab_codes_fpath)
create_sab_ddc_data(rmap, ddc_map_fpath)
return {
"@context": {
"@vocab": SKOS,
"prefLabel": {"@language": LANG}
"@vocab": KBV,
"@base": SAB_BASE.format(""),
"prefLabel": {"@language": LANG},
},
"@graph": rmap.values()
"@graph": list(rmap.values()),
}


def to_uri(base, code):
slug = urllib.quote(code.encode('utf-8'), safe=b':(),')
slug = quote(code.encode('utf-8'), safe='') # TODO: safe=':(),' (as in generated sab?)
return base.format(slug)

def create_sab_skos_data(rmap, fpath, limit=0):

def create_sab_skos_data(rmap, fpath):
label_map = {}
pending_broader = []

@@ -49,53 +52,66 @@ def create_sab_skos_data(rmap, fpath, limit=0):
"@id": uri,
"@type": "Concept",
"notation": code,
"prefLabel": label
"prefLabel": label,
}
label_map[label] = uri
if ': ' in label:
pending_broader.append((r, label.rsplit(': ', 1)[0]))
if limit and i > limit:
break

for r, broader_label in pending_broader:
broader_uri = label_map.get(broader_label)
if broader_uri:
r.setdefault("broader", []).append({"@id": broader_uri})


def create_sab_ddc_data(rmap, fpath):
for number, sab_code, ddc_code, hint in read_csv_items(
fpath, skip_comment=True, coding='utf-8', size=4):
fpath, skip_comment=True, size=4
):
hint = re.split(r'\s+|\w(?=#)', hint)[-1].strip()
link = hint_link_map.get(hint)
if not link:
print >> sys.stderr, "No link map for", hint.encode('utf-8')
print("No link map for", hint, file=sys.stderr)
continue
uri = to_uri(SAB_BASE, sab_code)
uri = to_uri("{}", sab_code)
rmap.setdefault(uri, {"@id": uri}).setdefault(link, []).append(
{"@id": to_uri(DDC_BASE, ddc_code)})
{
"@id": to_uri(DDC_BASE, ddc_code)
} if DDC_BASE else {
"@type": "ClassificationDdc",
"code": ddc_code
}
)


def read_csv_items(fpath, skip_first=True, skip_comment=False,
csv_dialect='excel-tab', coding='latin-1', size=0):
with open(fpath, 'rb') as fp:
def read_csv_items(
fpath,
skip_first=True,
skip_comment=False,
csv_dialect='excel-tab',
encoding='latin-1',
size=0,
):
with open(fpath, 'rt', encoding=encoding) as fp:
reader = csv.reader(fp, csv_dialect)
if skip_first is True:
reader.next()
next(reader)
for row in reader:
if not row or skip_comment and row[0].startswith(b'#'):
if not row or skip_comment and row[0].startswith('#'):
continue
cols = [col.strip().decode(coding) for col in row]
cols = [col.strip() for col in row]
if size and len(cols) > size:
cols = cols[0:size]
yield cols


if __name__ == '__main__':
args = sys.argv[1:]
sab_codes_fpath = args.pop(0)
ddc_map_fpath = args.pop(0)
limit = int(args.pop(0)) if args else 0
sab_codes_fpath = args.pop(0) if args else None

data = create_data(sab_codes_fpath, ddc_map_fpath, limit)
print json.dumps(data,
indent=2, separators=(',', ': '), sort_keys=True, ensure_ascii=False
).encode('utf-8')
data = create_data(ddc_map_fpath, sab_codes_fpath)
s = json.dumps(
data, indent=2, separators=(',', ': '), sort_keys=True, ensure_ascii=False
)
print(s)
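
A quick illustration of the URI minting after the change above: SAB codes are now placed under https://id.kb.se/term/kssb/ and, with safe='', every reserved character in a code is percent-encoded, which is what the TODO about safe=':(),' refers to. A small sketch, with purely illustrative codes:

from urllib.parse import quote

SAB_BASE = "https://id.kb.se/term/kssb/{0}"

def to_uri(base, code):
    # Mirrors the rewritten helper: percent-encode everything, including ':', '(', ')', ','.
    return base.format(quote(code.encode("utf-8"), safe=""))

print(to_uri(SAB_BASE, "Hc.01"))  # https://id.kb.se/term/kssb/Hc.01 ('.' is unreserved)
print(to_uri(SAB_BASE, "Ab:k"))   # https://id.kb.se/term/kssb/Ab%3Ak (':' now escaped; see the TODO)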
26 changes: 17 additions & 9 deletions scripts/extract_sab_data_from_docx.py
@@ -6,7 +6,7 @@
from zipfile import ZipFile
import json
from lxml import etree
from urllib.parse import quote
from urllib.parse import quote, unquote


NS = {'w': "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
@@ -61,32 +61,40 @@ def handle_row(self, section, level, parts):

def handle_main_row(self, level, parts):
current = self._stack[-1]

node = self._make_node(parts, current=current)
if node is None:
return

code = node['code']

dangling = False

if node['@type'].endswith('Subdivision'):
current.setdefault('element', []).append(node)
return
elif not level and not len(code) == 1:
return
# FIXME: we seem to lose parent 'B' from e.g. ':a' in HJÄLPTABELLER
dangling = True

if node['@type'].endswith('Collection'):
self._current_coll = [ node[ID] ] + code.split('--')
# NOTE: we "incorrectly" link the collection to the parent
# classification, instead of:
#return
elif self._current_coll:
# FIXME: _current_coll needs to be like _stack ...
# TODO 668fe2a3: _current_coll needs to be like _stack ...
coll_id, coll_start, coll_end = self._current_coll
if (len(code) == len(coll_start) and
code >= coll_start and code <= coll_end):
node['inCollection'] = {ID: coll_id}

if len(code) > len(current['code']):
# "Deeper" Collection nodes got lost otherwise (see FIXME though)
if dangling:
prev_parent = self._stack[-1]
if node['code'].startswith(prev_parent['code']):
prev_parent.setdefault('narrower', []).append(node)
elif len(code) > len(current['code']):
# "Deeper" Collection nodes got lost otherwise (but see 668fe2a3)
if node['@type'].endswith('Collection'):
self.helptable.append(node)
elif code.startswith(current['code']):
@@ -148,14 +156,14 @@ def _make_node(self, parts, element_type=None, current=None):

node['inScheme'] = {ID: f'/term/{SAB_CODE}'}

node_id = "%s" % quote(code.encode('utf-8'), safe=b'')
node_id = "%s" % quote(code, safe='')

# NOTE: Many local elements are similar to their top-level element, but
# far from all (and special '.0' elements are always locally unique).
if current and element_type:
if code[0:2] != '.0':
node['broader'] = {ID: node_id}
node_id = current[ID] + code
node_id = quote(unquote(current[ID]) + code)

node[ID] = node_id

@@ -330,7 +338,7 @@ def extract_sab(doc, debug=False):
assert not indent
in_section = HEADING_MAP.get(parts[0], in_section)
if debug:
print("#", parts[0])
print("#", parts[0], '=>', in_section)
else:
if len(parts) > 2:
parts[2] = ' '.join(parts[2:])
Expand All @@ -347,7 +355,7 @@ def extract_sab(doc, debug=False):

if debug:
print(indent, sep='', end='')
print(('%s =' % parts[0]), *parts[1:], sep='\t')
print(in_section, '%s =' % parts[0], *parts[1:], sep='\t')

return flatten(thandler.get_results())

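One detail worth spelling out from the _make_node hunk above: local element ids are built by appending the element code to the parent id, and since parent ids are already percent-encoded, the new expression decodes first and re-encodes the whole string once. A minimal illustration with made-up values:

from urllib.parse import quote, unquote

parent_id = quote("Hc=c", safe="")       # 'Hc%3Dc' -- hypothetical, already-encoded parent id
code = ":b"                              # hypothetical local element code

print(parent_id + code)                  # Hc%3Dc:b     (plain concatenation leaves the code unescaped)
print(quote(parent_id + code))           # Hc%253Dc%3Ab (naive re-quoting double-encodes '%')
print(quote(unquote(parent_id) + code))  # Hc%3Dc%3Ab   (new behaviour: decode, append, encode once)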
128 changes: 128 additions & 0 deletions scripts/make_precoordinated_sab_terms.py
@@ -0,0 +1,128 @@
# This creates Turtle data with all precoordinated terms found in Libris, by
# processing a SPARQL result of all uses. It then parses the SAB codes, to
# check if they can be constructed from known "atoms" (which are loaded first).
# It also lists all unknown components on stderr.
from urllib.parse import quote, unquote
import csv
import gzip
import re
import sys
from textwrap import dedent

from sabcodeparser import parse_sab_code

qc = lambda c: quote(c, safe='')

sabterms = sys.argv[1]

sabcodes: set[str] = set()
sablangcodes: set[str] = set()

with open(sabterms) as f:
for l in f:
for slug in re.findall('^<([^>]+)>', l):
break
else:
continue

code = unquote(slug)

if '--' in code:
continue

if code.startswith('='):
assert 'LanguageSubdivision' in l
sablangcodes.add(code[1:])

sabcodes.add(code)

usagefpath = sys.argv[2]

compositebases: set[str] = set()

print("""\
prefix owl: <http://www.w3.org/2002/07/owl#>
prefix : <https://id.kb.se/vocab/>
base <https://id.kb.se/term/kssb/>""")

with gzip.open(usagefpath, "rt") as f:
reader = csv.reader(f, 'excel-tab') # type: ignore

for i, row in enumerate(reader):
if i == 0 and row == ["cls", "count", "sample"]:
continue

code, count, sample = row

# NOTE: skipping the long tail of usages less than...
if count.isnumeric() and int(count) < 20:
break

if code in sabcodes:
continue

unknown = []

first = None
cleancode = ""
parts = []

# NOTE: Don't precoordinate uses of MediaSubdivision
# (Already filtered in statistics query.)
#if re.search(r'/[A-Z]+', code):
# continue

for part in parse_sab_code(code):
if not first:
first = part

if part.startswith('z '):
continue

cleancode += part

if part.startswith('.'):
assert first
part = first[0] + part

if part not in sabcodes:
lit_transl_to_sv_code = 'Hce'
if part.startswith(lit_transl_to_sv_code):
tolang = part.removeprefix(lit_transl_to_sv_code)
if tolang in sablangcodes:
altcode = 'H' + tolang
if altcode in sabcodes and part not in compositebases:
altcode_sv = f"{altcode}=c"
print(dedent(f"""
<{qc(part)}> a :Classification ;
:exactMatch <{qc(altcode_sv)}> ;
:code "{part}" ; # "{altcode_sv}"^^:SABEduCode ;
:broader <Hce>, <{qc(altcode)}>, <{qc('=c')}> ;
:inScheme </term/kssb> ."""))
compositebases.add(part)
parts.append(part)
continue

unknown.append(part)
break
else:
parts.append(part)
else:
broader = ', '.join(f"<{qc(part)}>" for part in parts)
if 'z ' not in code and cleancode not in sabcodes:
altcode = code.replace(' ', '')
if cleancode != altcode:
sameas = f' owl:sameAs <{qc(altcode)}> ;'
altcode = f'; # "{code}"^^:SABAltCode '
else:
sameas = ''
altcode = ''

print(dedent(f"""
<{qc(cleancode)}> a :Classification ;{sameas}
:code "{cleancode}" {altcode};
:broader {broader} ;
:inScheme </term/kssb> ."""))

if unknown:
print(code, "|".join(unknown), count, f"<{sample}>", sep='\t', file=sys.stderr)