Skip to content

Commit

Permalink
improve documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
BeritJanssen committed Jul 10, 2024
1 parent 7737f31 commit 2a86f26
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 19 deletions.
6 changes: 6 additions & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ __Module:__ `ianalyzer_readers.readers.html`

::: ianalyzer_readers.readers.html

## RDF reader

__Module:__ `ianalyzer_readers.readers.rdf`

::: ianalyzer_readers.readers.rdf

## Extractors

__Module:__ `ianalyzer_readers.extract`
Expand Down
29 changes: 18 additions & 11 deletions ianalyzer_readers/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,18 +595,21 @@ def _apply(self, metadata, *nargs, **kwargs):

class RDF(Extractor):
''' An extractor to extract data from RDF triples
Parameters:
predicates:
an iterable of predicates with which to query for triples of interest
an iterable of predicates (i.e., the middle part of an RDF triple) with which to query for objects
node_type:
either the subject or object of the triple is returned
default: 'object'; returning the subject instead can be useful, e.g., for identifiers
if 'subject': return the subject (effectively a no-op), useful for extracting identifiers or URLs
if 'object': return value(s) from objects occurring in triples with the current subject / predicate combination
multiple:
if `True`, return a list of all nodes for which the query returns a result,
if `False`, return the first node matching a query
if `True`: return a list of all nodes for which the query returns a result,
if `False`: return the first node matching a query
is_collection:
specify whether the data of interest is a collection, i.e., sequential data
this is specified by the predicates `rdf:first` and `rdf:rest`, see https://rdflib.readthedocs.io/en/stable/_modules/rdflib/collection.html
a collection is indicated by the predicates `rdf:first` and `rdf:rest`, see [rdflib documentation](https://rdflib.readthedocs.io/en/stable/_modules/rdflib/collection.html)
'''

Expand All @@ -619,30 +622,34 @@ def __init__(self, *predicates: Iterable[URIRef], node_type: str = 'object', mul

def _apply(self, graph: Graph = None, subject: BNode = None, *nargs, **kwargs) -> Union[str, List[str]]:
''' apply a query to the RDFReader's graph, with one subject resulting from the `document_subjects` function
Parameters:
graph: a graph in which to query (set on RDFReader)
subject: the subject with which to query
Returns:
a string or list of strings
'''
if self.node_type == 'subject':
return self.get_node_value(subject)
return self._get_node_value(subject)
if self.is_collection:
collection = Collection(graph, subject)
return [self.get_node_value(node) for node in list(collection)]
return [self._get_node_value(node) for node in list(collection)]
nodes = self._select(graph, subject, self.predicates)
if self.multiple:
return [self.get_node_value(node) for node in nodes]
return self.get_node_value(nodes[0])
return [self._get_node_value(node) for node in nodes]
return self._get_node_value(nodes[0])

def _select(self, graph, subject, predicates: Iterable[URIRef]) -> List[Union[Literal, URIRef, BNode]]:
''' search in a graph with predicates
if more than one predicate is passed, this is a recursive query:
the first search result of the query is used as a subject in the next query
Parameters:
subject: the subject with which to query
graph: the graph to search
predicates: a list of predicates with which to query
Returns:
a list of nodes matching the query
'''
Expand All @@ -652,7 +659,7 @@ def _select(self, graph, subject, predicates: Iterable[URIRef]) -> List[Union[Li
else:
return nodes

def get_node_value(self, node):
def _get_node_value(self, node):
''' return a string value extracted from the node '''
if type(node) == Literal:
return node.value
Expand Down
24 changes: 16 additions & 8 deletions ianalyzer_readers/readers/rdf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
'''
This module defines a Resource Description Framework (RDF) reader.
Extraction is based on the [rdflib library](https://rdflib.readthedocs.io/en/stable/index.html).
'''

from typing import Iterable, Union

from rdflib import BNode, Graph, URIRef
from rdflib import BNode, Graph, Literal, URIRef

from .core import Reader, Document, Source

Expand All @@ -9,7 +15,7 @@ class RDFReader(Reader):
'''
A base class for Readers of Resource Description Framework files.
These could be in Turtle, JSON-LD, RDF/XML or other formats,
see [rdflib parsers](https://rdflib.readthedocs.io/en/stable/plugin_parsers.html)
see [rdflib parsers](https://rdflib.readthedocs.io/en/stable/plugin_parsers.html).
'''

def source2dicts(self, source: Source) -> Iterable[Document]:
Expand All @@ -31,16 +37,18 @@ def source2dicts(self, source: Source) -> Iterable[Document]:
for subject in document_subjects:
yield self._document_from_subject(g, subject)

def document_subjects(self, graph: Graph) -> list:
''' override this function such that each subject can be used to
retrieve a separate document in the resulting index
def document_subjects(self, graph: Graph) -> Iterable[Union[BNode, Literal, URIRef]]:
''' Override this function to return all subjects (i.e., the first part of an RDF triple)
with which to search for data in the RDF graph.
Typically, such subjects are identifiers or URLs.
Parameters:
graph: the graph to parse
Returns:
list of nodes
generator or list of nodes
'''
return list(graph.subjects())
return graph.subjects()

def _document_from_subject(self, graph: Graph, subject: Union[BNode, URIRef]) -> dict:
def _document_from_subject(self, graph: Graph, subject: Union[BNode, Literal, URIRef]) -> dict:
return {field.name: field.extractor.apply(graph=graph, subject=subject) for field in self.fields}

0 comments on commit 2a86f26

Please sign in to comment.