Skip to content

Commit

Permalink
Merge pull request #472 from thegetty/debug_basegraph
Browse files Browse the repository at this point in the history
RDF prefixes used in 'id' fields should not be treated as relative URIs
  • Loading branch information
benosteen authored Sep 23, 2024
2 parents 8a8540e + 33857e1 commit 8a77536
Show file tree
Hide file tree
Showing 9 changed files with 154 additions and 14 deletions.
7 changes: 7 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,13 @@ SUBADDRESSING=True
# Can use GET param '?relativeid=true' to skip prefixing on request
PREFIX_RECORD_IDS=RECURSIVE

# Id prefixing will not prefix any RDF shorthand URIs (eg 'rdfs:label') in the `id`/`@id` property
# It does this by resolving the context, if present, and getting the list of prefixes to ignore
# NB this only affects prefixes used with a colon (eg 'rdf:type' will be skipped if 'rdf' is in the context but
# 'rdf/type' will be prefixed by the server's host and subpath as normal.)
# The prefixes for a given context will be cached for a number of seconds (default 12 hours as below):
CONTEXTPREFIX_TTL=43200

# RDF Context Cache
## LOD Gateway runs a cache for context documents, defaulting to 30 minutes in cache per context
RDF_CONTEXT_CACHE_EXPIRES=30
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ When ingesting records into the LOD Gateway, any top-level `"id"` properties in

For example, when ingesting the record for Vincent van Gogh's _Irises_ (1889) into an LOD Gateway instance deployed at `https://data.getty.edu/museum/collection`, the `"id"` property MUST have an `"id"` value with a relative URI of `"object/c88b3df0-de91-4f5b-a9ef-7b2b9a6d8abb"` resulting in the Gateway serving the record via the absolute URI of `https://data.getty.edu/museum/collection/object/c88b3df0-de91-4f5b-a9ef-7b2b9a6d8abb`. The Gateway will also insert the URL prefix into the `"id"` values before returning the response, converting any relative URIs in the document to absolute URIs that can be resolved by downstream systems.

If a JSON-LD context is present, id prefixing leaves any `"id"` value that uses an RDF prefix declared in that context (e.g. `rdfs:label`) unchanged. The Gateway processes the context to find the prefixes it declares (e.g. `rdf`, `rdfs`, `crm`) and skips any `"id"` value of the form `prefix:suffix` that starts with one of them.

The following code sample illustrates ingesting a record into an LOD Gateway instance, including how to supply the `Authorization` header, how to prepare the line-delimited `POST` body containing one or more serialized JSON/JSON-LD strings, and how if desired to submit multiple records as part of a single request:

```
Expand Down
12 changes: 12 additions & 0 deletions source/web-service/flaskapp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,18 @@ def create_app():
# RDFidPrefix should not end with a /
app.config["RDFidPrefix"][:-1]

# How long (in seconds) the idprefixer keeps the list of RDF prefixes it got
# from resolving a context before resolving it again (default 12 hours)
app.config["CONTEXTPREFIX_TTL"] = 60 * 60 * 12
try:
    app.config["CONTEXTPREFIX_TTL"] = int(
        environ.get("CONTEXTPREFIX_TTL", app.config["CONTEXTPREFIX_TTL"])
    )
except ValueError:
    # int() raised before the assignment above took place, so the config
    # still holds the default. The message must be an f-string for that
    # value to actually appear in the log.
    app.logger.error(
        f"The value in the 'CONTEXTPREFIX_TTL' environment key was not an integer duration of seconds. Setting to {app.config['CONTEXTPREFIX_TTL']}"
    )

app.config["SQLALCHEMY_DATABASE_URI"] = environ["DATABASE"]
app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False

Expand Down
49 changes: 45 additions & 4 deletions source/web-service/flaskapp/base_graph_utils.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
from flaskapp.models import db
from flaskapp.models.record import Record
from datetime import datetime
from functools import wraps

from flask import current_app

# To parse out the base graph
from pyld import jsonld
from pyld.jsonld import set_document_loader

# docloader caching
import requests
from datetime import datetime, timedelta

from sqlalchemy.exc import ProgrammingError

from flaskapp.utilities import is_quads, quads_to_triples
from flaskapp.utilities import quads_to_triples, checksum_json
from flaskapp.storage_utilities.record import get_record, record_create

"""
Expand Down Expand Up @@ -131,3 +130,45 @@ def base_graph_filter(basegraphobj, fqdn_id):
"Failed to access record table - has the initial flask db upgrade been run?"
)
return set()


# Not limiting the context cache, as there are typically only a few in play
# in most services. Even a hundred would be fine.
_PREFIX_CACHE = {}


def cache_context_prefixes(f):
    """Cache the result ``f`` produces for each distinct JSON-LD context.

    The context is read from the first positional argument, or from the
    ``context_json`` keyword when no positional argument is given. Results
    are stored in the module-level ``_PREFIX_CACHE`` under
    ``checksum_json(context)`` together with the time they were produced, and
    are regenerated once they are older than the ``CONTEXTPREFIX_TTL``
    setting (in seconds).

    Returns the wrapped function. It returns an empty set, without calling
    ``f``, when the context is missing or empty.
    """

    @wraps(f)
    def wrapper(*args, **kwargs):
        # Accept the context by keyword as well as by position
        context = args[0] if args else kwargs.get("context_json")
        if not context:
            return set()

        strhash = checksum_json(context)
        dnow = datetime.timestamp(datetime.now())
        oldest_allowed = dnow - current_app.config["CONTEXTPREFIX_TTL"]

        # Regenerate if this context has not been resolved yet, or if it has
        # but the stored result is older than the TTL allows
        cached = _PREFIX_CACHE.get(strhash)
        if cached is None or cached[1] < oldest_allowed:
            cached = (f(*args, **kwargs), dnow)
            _PREFIX_CACHE[strhash] = cached

        return cached[0]

    return wrapper


@cache_context_prefixes
def get_url_prefixes_from_context(context_json):
    """Return the set of terms in ``context_json`` that are flagged as prefixes.

    The context is processed with PyLD, using the document loader configured
    in ``RDF_DOCLOADER``, and every term whose resulting definition has
    ``_prefix`` set to ``True`` (eg 'rdfs') is collected.
    """
    proc = jsonld.JsonLdProcessor()
    options = {
        "isFrame": False,
        "keepFreeFloatingNodes": False,
        "documentLoader": current_app.config["RDF_DOCLOADER"],
        "extractAllScripts": False,
        "processingMode": "json-ld-1.1",
    }
    mappings = proc.process_context({"mappings": {}}, context_json, options)["mappings"]
    # A definition can be empty (PyLD appears to store None for a term mapped
    # to null - confirm against the PyLD version in use) or lack '_prefix',
    # so neither case may be indexed directly.
    return {
        term
        for term, definition in mappings.items()
        if definition and definition.get("_prefix") is True
    }
5 changes: 5 additions & 0 deletions source/web-service/flaskapp/routes/records.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
status_ok,
)
from flaskapp.utilities import checksum_json, authenticate_bearer
from flaskapp.base_graph_utils import get_url_prefixes_from_context

import time

Expand Down Expand Up @@ -434,13 +435,17 @@ def entity_record(entity_id):

# Assume that id/@id choice used in the data is the same as the top level
attr = "@id" if "@id" in data else "id"
urlprefixes = None
if "@context" in data:
urlprefixes = get_url_prefixes_from_context(data["@context"])

data = containerRecursiveCallback(
data=data,
attr=attr,
callback=idPrefixer,
prefix=idPrefix,
recursive=recursive,
urlprefixes=urlprefixes,
)

current_app.logger.debug(
Expand Down
27 changes: 20 additions & 7 deletions source/web-service/flaskapp/storage_utilities/graph.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,24 @@
import requests
import json
import time
import re

from flask import current_app, request, abort, jsonify
from flask import current_app

from flaskapp.storage_utilities.record import get_record
from flaskapp.utilities import (
Event,
containerRecursiveCallback,
idPrefixer,
full_stack_trace,
is_quads,
quads_to_triples,
graph_filter,
)

import rdflib
import traceback

from pyld import jsonld
from pyld.jsonld import JsonLdError

from flaskapp.base_graph_utils import base_graph_filter
from flaskapp.base_graph_utils import base_graph_filter, get_url_prefixes_from_context
from flaskapp.graph_prefix_bindings import get_bound_graph


Expand All @@ -39,9 +36,18 @@ def __init__(

def inflate_relative_uris(data, id_attr="id"):
    """Prefix the ``id_attr`` values found in ``data`` with ``RDFidPrefix``.

    When ``data`` carries an ``@context``, the prefixes that context declares
    are looked up and handed to ``idPrefixer`` so that ids written with them
    are not treated as relative URIs.

    Returns whatever ``containerRecursiveCallback`` returns for ``data``.
    """
    idPrefix = current_app.config["RDFidPrefix"]
    # A set, to match what get_url_prefixes_from_context and idPrefixer use
    urlprefixes = set()

    if "@context" in data:
        # Get any context-added url prefixes:
        urlprefixes = get_url_prefixes_from_context(data["@context"])

    return containerRecursiveCallback(
        data=data,
        attr=id_attr,
        callback=idPrefixer,
        prefix=idPrefix,
        urlprefixes=urlprefixes,
    )


Expand Down Expand Up @@ -85,6 +91,8 @@ def graph_expand(data, proc=None):
tictoc = time.perf_counter()

# PyLD expansion? or RDFLIB?

current_app.logger.debug(f"Starting data to expand: '{data}'")
if current_app.config["USE_PYLD_REFORMAT"] is True:
current_app.logger.info(f"{json_ld_id} - expanding using PyLD")
try:
Expand Down Expand Up @@ -181,6 +189,11 @@ def graph_replace(graph_name, serialized_nt, update_endpoint):
current_app.logger.debug(
f"Filtering base triples ({len(current_app.config['RDF_FILTER_SET'])}) from graph n-triples"
)
current_app.logger.debug(
f"Incoming triples to filter: ("
+ str(serialized_nt.split("\n"))
+ ") from graph n-triples"
)
serialized_nt = graph_filter(
serialized_nt, current_app.config["RDF_FILTER_SET"]
)
Expand Down
12 changes: 9 additions & 3 deletions source/web-service/flaskapp/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def containerRecursiveCallback(
find=None,
replace=None,
prefix=None,
urlprefixes=None,
suffix=None,
callback=None,
recursive=True,
Expand Down Expand Up @@ -172,6 +173,7 @@ def generalModify(key, value, find=None, replace=None, prefix=None, suffix=None)
prefix=prefix,
suffix=suffix,
callback=callback,
urlprefixes=urlprefixes,
)
else:
if (attr == None or attr == key) and isinstance(val, str):
Expand All @@ -182,6 +184,7 @@ def generalModify(key, value, find=None, replace=None, prefix=None, suffix=None)
replace=replace,
prefix=prefix,
suffix=suffix,
urlprefixes=urlprefixes,
)

data[key] = val
Expand All @@ -199,6 +202,7 @@ def generalModify(key, value, find=None, replace=None, prefix=None, suffix=None)
prefix=prefix,
suffix=suffix,
callback=callback,
urlprefixes=urlprefixes,
)
else:
if (attr == None or attr == key) and isinstance(val, str):
Expand All @@ -209,18 +213,20 @@ def generalModify(key, value, find=None, replace=None, prefix=None, suffix=None)
replace=replace,
prefix=prefix,
suffix=suffix,
urlprefixes=urlprefixes,
)

data[key] = val

return data


def idPrefixer(attr, value, prefix=None, urlprefixes=None, **kwargs):
    """Helper callback method to prefix non-prefixed JSON-LD document 'id' attributes

    ``value`` is returned unchanged when no ``prefix`` is given, or when the
    text in front of its first colon is one of ``urlprefixes`` or one of
    ``ALLOWED_SCHEMES``. Any other value is treated as a relative URI and is
    returned as ``prefix + "/" + value``.

    ``attr`` and any extra keyword arguments are accepted so the function can
    be used as a callback, but they are not used.
    """
    if not prefix:
        return value

    head, colon, _ = value.partition(":")
    is_rdf_prefixed = bool(urlprefixes) and head in urlprefixes
    # Only text followed by a colon can be a URI scheme or an RDF prefix; a
    # bare relative id that happens to equal one (eg 'rdfs') is still relative
    # and must be prefixed.
    if colon and (is_rdf_prefixed or head in ALLOWED_SCHEMES):
        return value

    # prefix any relative uri with the prefix
    return prefix + "/" + value
Expand Down
36 changes: 36 additions & 0 deletions source/web-service/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,42 @@ def _sample_record():
return _sample_record


@pytest.fixture
def sample_idprefixdata(sample_rdfrecord_with_context):
    """Provide one record built by the ``sample_rdfrecord_with_context`` factory."""
    record = sample_rdfrecord_with_context()
    return record


@pytest.fixture
def sample_rdfrecord_with_context(test_db):
    """Provide a factory that stores and returns a record with RDF-prefixed ids.

    The record's data has an inline ``@context`` declaring the ``dc`` and
    ``rdfs`` prefixes, a relative top-level ``@id`` and a nested ``@id``
    written with the ``dc`` prefix.
    """

    def _sample_record():
        context = {
            "dc": "http://purl.org/dc/elements/1.1/",
            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
            "_label": {"@id": "http://www.w3.org/2000/01/rdf-schema#label"},
            "_prefixedlabel": {"@id": "rdfs:label"},
        }
        see_also = {
            "_label": "This is a meaningless bit of data to test if the idprefixer leaves the id alone",
            "@id": "dc:description",
        }
        document = {
            "@context": context,
            "@id": "rdfsample1",
            "rdfs:seeAlso": [see_also],
        }

        record = Record(
            entity_id=str(uuid4()),
            entity_type="Object",
            datetime_created=datetime(2020, 11, 22, 13, 2, 53),
            datetime_updated=datetime(2020, 12, 18, 11, 22, 7),
            data=document,
        )
        # Add the record to the session and commit it before handing it back
        test_db.session.add(record)
        test_db.session.commit()
        return record

    return _sample_record


@pytest.fixture
def linguisticobject():
def _generator(name, id):
Expand Down
18 changes: 18 additions & 0 deletions source/web-service/tests/test_routes_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,24 @@ def test_prefix_record_ids_top(
assert "LOD Gateway" in response.headers["Server"]
assert json.loads(response.data) == data

def test_prefix_record_w_rdf_prefixes(
    self, sample_idprefixdata, client, namespace, current_app
):
    """Ids written with a prefix from the context must come back unchanged."""
    current_app.config["PREFIX_RECORD_IDS"] = "RECURSIVE"

    record_url = f"/{namespace}/{sample_idprefixdata.entity_id}"
    response = client.get(record_url)

    assert response.status_code == 200

    body = response.get_json()
    nested_id = body["rdfs:seeAlso"][0]["@id"]
    context_id = body["@context"]["_prefixedlabel"]["@id"]

    # make sure the main relative id has been prefixed
    assert not body["@id"].startswith("rdfsample1")

    # make sure that the RDF prefixed IDs have been left alone.
    assert nested_id == "dc:description"
    assert context_id == "rdfs:label"

def test_prefix_record_ids_none(
self, sample_data_with_ids, client, namespace, current_app
):
Expand Down

0 comments on commit 8a77536

Please sign in to comment.