improve documentation

CentreForDigitalHumanities · Jul 10, 2024 · 2a86f26 · 2a86f26
1 parent 7737f31
commit 2a86f26
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 19 deletions.
diff --git a/docs/api.md b/docs/api.md
@@ -30,6 +30,12 @@ __Module:__ `ianalyzer_readers.readers.html`
 
 ::: ianalyzer_readers.readers.html
 
+## RDF reader
+
+__Module:__ `ianalyzer_readers.readers.rdf`
+
+::: ianalyzer_readers.readers.rdf
+
 ## Extractors
 
 __Module:__ `ianalyzer_readers.extract`

diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py
@@ -595,18 +595,21 @@ def _apply(self, metadata, *nargs, **kwargs):
 
 class RDF(Extractor):
     ''' An extractor to extract data from RDF triples
+
     Parameters:
         predicates: 
-            an iteratble of predicates with which to query for triples of interest
+            an iterable of predicates (i.e., the middle part of an RDF triple) with which to query for objects
         node_type:
-            either the subject or object of the triple is returned
-            default: 'object'; returning the subject instead can be useful, e.g., for identifiers
+            if 'subject': return the subject (effectively a no-op), useful for extracting identifiers or urls
+            
+            if 'object': return value(s) from objects occurring in triples with the current subject / predicate combination
         multiple: 
-            if `True`, return a list of all nodes for which the query returns a result,
-            if `False`, return the first node matching a query
+            if `True`: return a list of all nodes for which the query returns a result,
+            
+            if `False`: return the first node matching a query
         is_collection:
             specify whether the data of interest is a collection, i.e., sequential data
-            this is specified by the predicates `rdf:first` and `rdf:rest`, see https://rdflib.readthedocs.io/en/stable/_modules/rdflib/collection.html
+            a collection is indicated by the predicates `rdf:first` and `rdf:rest`, see [rdflib documentation](https://rdflib.readthedocs.io/en/stable/_modules/rdflib/collection.html)
 
     '''
 
@@ -619,30 +622,34 @@ def __init__(self, *predicates: Iterable[URIRef], node_type: str = 'object', mul
 
     def _apply(self, graph: Graph = None, subject: BNode = None, *nargs, **kwargs) -> Union[str, List[str]]:
         ''' apply a query to the RDFReader's graph, with one subject resulting from the `document_subjects` function
+        
         Parameters:
             graph: a graph in which to query (set on RDFReader)
             subject: the subject with which to query
+        
         Returns:
             a string or list of strings
         '''
         if self.node_type == 'subject':
-            return self.get_node_value(subject)
+            return self._get_node_value(subject)
         if self.is_collection:
             collection = Collection(graph, subject)
-            return [self.get_node_value(node) for node in list(collection)]
+            return [self._get_node_value(node) for node in list(collection)]
         nodes = self._select(graph, subject, self.predicates)
         if self.multiple:
-            return [self.get_node_value(node) for node in nodes]
-        return self.get_node_value(nodes[0])
+            return [self._get_node_value(node) for node in nodes]
+        return self._get_node_value(nodes[0])
 
     def _select(self, graph, subject, predicates: Iterable[URIRef]) -> List[Union[Literal, URIRef, BNode]]:
         ''' search in a graph with predicates
             if more than one predicate is passed, this is a recursive query:
             the first search result of the query is used as a subject in the next query
+            
             Parameters:
                 subject: the subject with which to query
                 graph: the graph to search
                 predicates: a list of predicates with which to query
+            
             Returns:
                 a list of nodes matching the query
         '''
@@ -652,7 +659,7 @@ def _select(self, graph, subject, predicates: Iterable[URIRef]) -> List[Union[Li
         else:
             return nodes
 
-    def get_node_value(self, node):
+    def _get_node_value(self, node):
         ''' return a string value extracted from the node '''
         if type(node) == Literal:
             return node.value

diff --git a/ianalyzer_readers/readers/rdf.py b/ianalyzer_readers/readers/rdf.py
@@ -1,6 +1,12 @@
+'''
+This module defines a Resource Description Framework (RDF) reader.
+
+Extraction is based on the [rdflib library](https://rdflib.readthedocs.io/en/stable/index.html).
+'''
+
 from typing import Iterable, Union
 
-from rdflib import BNode, Graph, URIRef
+from rdflib import BNode, Graph, Literal, URIRef
 
 from .core import Reader, Document, Source
 
@@ -9,7 +15,7 @@ class RDFReader(Reader):
     '''
     A base class for Readers of Resource Description Framework files.
     These could be in Turtle, JSON-LD, RDFXML or other formats,
-    see [rdflib parsers](https://rdflib.readthedocs.io/en/stable/plugin_parsers.html)
+    see [rdflib parsers](https://rdflib.readthedocs.io/en/stable/plugin_parsers.html).
     '''
 
     def source2dicts(self, source: Source) -> Iterable[Document]:
@@ -31,16 +37,18 @@ def source2dicts(self, source: Source) -> Iterable[Document]:
         for subject in document_subjects:
             yield self._document_from_subject(g, subject)
 
-    def document_subjects(self, graph: Graph) -> list:
-        ''' override this function such that each subject can be used to
-        retrieve a separate document in the resulting index
+    def document_subjects(self, graph: Graph) -> Iterable[Union[BNode, Literal, URIRef]]:
+        ''' Override this function to return all subjects (i.e., the first part of an RDF triple)
+        with which to search for data in the RDF graph.
+        Typically, such subjects are identifiers or URLs.
         
         Parameters:
             graph: the graph to parse
+        
         Returns:
-            list of nodes
+            generator or list of nodes
         '''
-        return list(graph.subjects())
+        return graph.subjects()
 
-    def _document_from_subject(self, graph: Graph, subject: Union[BNode, URIRef]) -> dict:
+    def _document_from_subject(self, graph: Graph, subject: Union[BNode, Literal, URIRef]) -> dict:
         return {field.name: field.extractor.apply(graph=graph, subject=subject) for field in self.fields}