Skip to content

Commit

Permalink
Fixes #4 by adding backwards-compatible support for rdflib NamespaceMa…
Browse files Browse the repository at this point in the history
…nager.
  • Loading branch information
cadmiumkitty authored Jan 10, 2021
1 parent ded56b1 commit f50e8ea
Show file tree
Hide file tree
Showing 6 changed files with 145 additions and 53 deletions.
9 changes: 7 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,20 @@ Usage
Creating RDF from DataFrame
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

As of version 1.1.0, a NamespaceManager can be supplied to ``rdfpandas.to_graph`` for conversion to Graph.

::

import rdfpandas.graph
import pandas as pd
import rdflib
df = pd.read_csv('to_graph_test.csv', index_col = '@id', keep_default_na = False)
g = to_graph(df)
s = g.serialize(format='turtle')
namespace_manager = NamespaceManager(Graph())
namespace_manager.bind('skos', SKOS)
namespace_manager.bind('rdfpandas', Namespace('http://github.com/cadmiumkitty/rdfpandas/'))
g = to_graph(df, namespace_manager)
s = g.serialize(format = 'turtle')

Creating DataFrame from RDF
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
6 changes: 3 additions & 3 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@
# -- Project information -----------------------------------------------------

project = 'rdfpandas'
copyright = '2020, Eugene Morozov'
copyright = '2021, Eugene Morozov'
author = 'Eugene Morozov'

# The short X.Y version
version = 'v1.0.0'
version = 'v1.1.0'
# The full version, including alpha/beta/rc tags
release = 'v1.0.0'
release = 'v1.1.0'


# -- General configuration ---------------------------------------------------
Expand Down
122 changes: 102 additions & 20 deletions rdfpandas/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from rdflib.namespace import NamespaceManager
import re

def to_graph(df: pd.DataFrame) -> Graph:
def to_graph(df: pd.DataFrame, namespace_manager: NamespaceManager = None) -> Graph:
"""
Takes Pandas DataFrame and returns RDFLib Graph.
Row indices are used as subjects and column indices as predicates.
Expand All @@ -21,6 +21,8 @@ def to_graph(df: pd.DataFrame) -> Graph:
----------
df : pandas.DataFrame
DataFrame to be converted into Graph.
namespace_manager : rdflib.namespace.NamespaceManager
NamespaceManager to use to normalize URIs
Returns
-------
Expand All @@ -29,19 +31,23 @@ def to_graph(df: pd.DataFrame) -> Graph:
"""

g = Graph()
g = Graph(namespace_manager = namespace_manager)

prefixes = {}
for (prefix, namespace) in g.namespace_manager.namespaces():
prefixes[prefix] = namespace

for (index, series) in df.iterrows():
for (column, value) in series.iteritems():
match = re.search('([\w?:/.]*)(\{(\w*)\})?(\[(\d*)\])?(\(([\w?:/.]*)\))?(@(\w*))?', column)

if pd.notna(value) and pd.notnull(value):
s = _get_identifier(index)
p = _get_identifier(match.group(1))
s = _get_identifier(prefixes, index)
p = _get_identifier(prefixes, match.group(1))
if isinstance(value, bytes):
o = _get_identifier(value.decode('utf-8'), match.group(3), match.group(7), match.group(9))
o = _get_identifier(prefixes, value.decode('utf-8'), match.group(3), match.group(7), match.group(9))
else:
o = _get_identifier(value, match.group(3), match.group(7), match.group(9))
o = _get_identifier(prefixes, value, match.group(3), match.group(7), match.group(9))
g.add((s, p, o))

return g
Expand Down Expand Up @@ -110,22 +116,22 @@ def to_dataframe(g: Graph) -> pd.DataFrame:
language = idl[2]
idl_len = predicates[p][idl]
for index in range(idl_len):
series_name = f'{_get_str_for_uri(g.namespace_manager, p)}{{{instance}}}'
series_name = f'{_get_str_for_uriref(g.namespace_manager, p)}{{{instance}}}'
if idl_len > 1:
series_name = ''.join([series_name, f'[{index}]'])
if datatype:
series_name = ''.join([series_name, f'({_get_str_for_uri(g.namespace_manager, datatype)})'])
series_name = ''.join([series_name, f'({_get_str_for_uriref(g.namespace_manager, datatype)})'])
if language:
series_name = ''.join([series_name, f'@{language}'])
p_subjects = []
p_objects = []
if idls_len == 1 and idl_len == 1:
for s, o in sorted(g.subject_objects(p)):
p_subjects.append(_get_str_for_uri(g.namespace_manager, s))
p_subjects.append(_get_str_for_uriref(g.namespace_manager, s))
if isinstance(o, Literal):
p_objects.append(str(o))
else:
p_objects.append(_get_str_for_uri(g.namespace_manager, o))
p_objects.append(_get_str_for_uriref(g.namespace_manager, o))
else:
s_index = 0
last_seen_subject = None
Expand All @@ -135,25 +141,27 @@ def to_dataframe(g: Graph) -> pd.DataFrame:
o_idl = _get_idl_for_identifier(o)
if o_idl == idl:
if s_index == index:
p_subjects.append(_get_str_for_uri(g.namespace_manager, s))
p_subjects.append(_get_str_for_uriref(g.namespace_manager, s))
if isinstance(o, Literal):
p_objects.append(str(o))
else:
p_objects.append(_get_str_for_uri(g.namespace_manager, o))
p_objects.append(_get_str_for_uriref(g.namespace_manager, o))
s_index = s_index + 1
last_seen_subject = s
series[series_name] = pd.Series(data = p_objects, index = p_subjects, dtype = np.unicode_)

return pd.DataFrame(series)

def _get_identifier(value: object, instance: str = None, datatype: str = None, language: str = None) -> Identifier:
def _get_identifier(prefixes: dict, value: object, instance: str = None, datatype: str = None, language: str = None) -> Identifier:
"""
Takes value extracted from the index, column or cell and returns
an instance of Identifier (Literal, URIRef or BNode) using correct
datatype and language.
Parameters
----------
prefixes : dict
Prefixes to use to normalize URIs
value : object
Value of index, column or cell
instance : str
Expand All @@ -176,19 +184,32 @@ def _get_identifier(value: object, instance: str = None, datatype: str = None, l
return Literal(value, lang = language)
elif datatype:
return Literal(value, datatype = URIRef(datatype))
elif re.match('^\w*:\w*$', str(value)) or re.match('^http[s]?://.*$', str(value)):
elif _is_uri(value):
return URIRef(value)
elif _is_curie(value):
return _get_uriref_for_curie(prefixes, value)
else:
return Literal(value)
elif instance == Literal.__name__:
if language:
return Literal(value, lang = language)
elif datatype:
return Literal(value, datatype = URIRef(datatype))
if _is_uri(datatype):
datatype_uriref = URIRef(datatype)
elif _is_curie(datatype):
datatype_uriref = _get_uriref_for_curie(prefixes, datatype)
else:
ValueError(f'Not a valid URI for datatype {datatype}')
return Literal(value, datatype = datatype_uriref)
else:
return Literal(value)
elif instance == URIRef.__name__:
return URIRef(value)
if _is_uri(value):
return URIRef(value)
elif _is_curie(value):
return _get_uriref_for_curie(prefixes, value)
else:
ValueError(f'Not a valid URI {value}')
elif instance == BNode.__name__:
return BNode(value)

Expand Down Expand Up @@ -220,7 +241,7 @@ def _get_idl_for_identifier(i: Identifier) -> tuple:

return (instance, datatype, language)

def _get_str_for_uri(namespace_manager: NamespaceManager, uri: URIRef) -> str:
def _get_str_for_uriref(namespace_manager: NamespaceManager, uriref: URIRef) -> str:
"""
Reusing NamespaceManager.normalizeUri for transforming Graph to DataFrame.
In effect we only need to strip < and > from N3 representation and
Expand All @@ -230,8 +251,8 @@ def _get_str_for_uri(namespace_manager: NamespaceManager, uri: URIRef) -> str:
----------
namespace_manager : rdflib.namespace.NamespaceManager
NamespaceManager to use to normalize URIs
uri : rdflib.URIRef
NamespaceManager to use to normalize URIs
uriref : rdflib.URIRef
URI to normalize
Returns
-------
Expand All @@ -240,4 +261,65 @@ def _get_str_for_uri(namespace_manager: NamespaceManager, uri: URIRef) -> str:
"""

return re.sub('<|>', '', namespace_manager.normalizeUri(uri))
return re.sub('<|>', '', namespace_manager.normalizeUri(uriref))

def _get_uriref_for_curie(prefixes: dict, value: object) -> URIRef:
    """
    Converts a CURIE string into a URIRef with a fully qualified URI.

    Parameters
    ----------
    prefixes : dict
        Mapping of prefix to namespace URI used to expand the CURIE.
    value : object
        Value from DataFrame to be converted to URIRef; expected to have
        the form ``prefix:name``.

    Returns
    -------
    rdflib.URIRef
        Namespace URI concatenated with the local name if the prefix is
        present in ``prefixes``, otherwise a URIRef created from the value
        as is.

    Raises
    ------
    ValueError
        If the string form of the value contains no colon.
    """

    # Split on the first colon only: the prefix itself cannot contain a
    # colon, while the local name may. Using str() keeps this consistent
    # with _is_curie, which also inspects the string form of the value.
    prefix, name = str(value).split(':', 1)
    if prefix in prefixes:
        return URIRef(''.join((prefixes[prefix], name)))
    # Unknown prefix: leave the value unexpanded.
    return URIRef(value)

def _is_curie(value: object) -> bool:
"""
Checks if value from DataFrame is a CURIE.
Parameters
----------
value : object
Value from DataFrame to be checked.
Returns
-------
bool
True if value is matching CURIE pattern, false otherwise.
"""

return re.match('^\w*:\w*$', str(value))

def _is_uri(value: object) -> bool:
"""
Checks if value from DataFrame is a URI.
Parameters
----------
value : object
Value from DataFrame to be checked.
Returns
-------
bool
True if value is matching URI pattern, false otherwise.
"""

return re.match('^http[s]?://.*$', str(value))

19 changes: 10 additions & 9 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@
license = f.read()

setup(
name='rdfpandas',
version='1.0.0',
description='RDF support for Pandas',
long_description=readme,
author='Eugene Morozov',
author_email='[email protected]',
url='https://github.com/cadmiumkitty/rdfpandas',
license='MIT',
packages=find_packages(exclude=('tests', 'docs'))
name = 'rdfpandas',
version = '1.1.0',
description = 'RDF support for Pandas',
long_description = readme,
author = 'Eugene Morozov',
author_email = '[email protected]',
url = 'https://github.com/cadmiumkitty/rdfpandas',
license = 'MIT',
packages = find_packages(exclude = ('tests', 'docs')),
install_requires = ['pandas>=1.2.0', 'rdflib>=5.0.0']
)

27 changes: 14 additions & 13 deletions tests/rdf/test.ttl
Original file line number Diff line number Diff line change
@@ -1,23 +1,24 @@
@prefix rdfpandas: <http://github.com/cadmiumkitty/rdfpandas/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<rdfpandas:one> <rdfpandas:curie> <skos:broader> ;
<rdfpandas:double> "10.0"^^<xsd:double> ;
<rdfpandas:integer> "10"^^<xsd:integer> ;
<rdfpandas:string> "String 1",
rdfpandas:one rdfpandas:curie skos:broader ;
rdfpandas:double "10.0"^^xsd:double ;
rdfpandas:integer "10"^^xsd:integer ;
rdfpandas:string "String 1",
"String in English 1 (1)"@en,
"String in English 2 (1)"@en,
"String in Nepali 1 (1)"@ne,
"String in Russian 1 (1)"@ru,
"String with type 1 (1)"^^<xsd:string>,
"String with type 2 (1)"^^<xsd:string> ;
<rdfpandas:uri> <https://google.com> .
"String with type 1 (1)"^^xsd:string,
"String with type 2 (1)"^^xsd:string ;
rdfpandas:uri <https://google.com> .

<rdfpandas:two> <rdfpandas:anotherstring> "String 2" ;
<rdfpandas:double> "20.0"^^<xsd:double> ;
<rdfpandas:integer> "20"^^<xsd:integer> ;
<rdfpandas:string> "String in English 1 (2)"@en,
rdfpandas:two rdfpandas:anotherstring "String 2" ;
rdfpandas:double "20.0"^^xsd:double ;
rdfpandas:integer "20"^^xsd:integer ;
rdfpandas:string "String in English 1 (2)"@en,
"String in Nepali 1 (2)"@ne,
"String in Nepali 2 (2)"@ne,
"String with type 1 (2)"^^<xsd:string>,
"String with type 2 (2)"^^<xsd:string> .
"String with type 1 (2)"^^xsd:string,
"String with type 2 (2)"^^xsd:string .
15 changes: 9 additions & 6 deletions tests/test_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
import pandas as pd
import numpy as np

from rdflib import Graph, Literal, URIRef, BNode
from rdflib import Graph, Literal, URIRef, BNode, Namespace
from rdflib.term import Identifier
from rdflib.namespace import NamespaceManager
from rdflib.namespace import NamespaceManager, SKOS, XSD
import rdflib.compare

import unittest
Expand Down Expand Up @@ -156,7 +156,7 @@ def test_should_convert_data_frame_to_graph_literal(self):
Literal('String')))
g_expected.add((URIRef('http://github.com/cadmiumkitty/rdfpandas/one'),
URIRef('http://github.com/cadmiumkitty/rdfpandas/string'),
Literal('String with type only', datatype = URIRef('xsd:string'))))
Literal('String with type only', datatype = XSD.string)))
g_expected.add((URIRef('http://github.com/cadmiumkitty/rdfpandas/one'),
URIRef('http://github.com/cadmiumkitty/rdfpandas/string'),
Literal('String with language only in Nepali', lang = 'ne')))
Expand Down Expand Up @@ -334,7 +334,10 @@ def test_should_roundtrip_csv_to_graph_to_csv(self):
"""

df = pd.read_csv('./csv/test.csv', index_col = '@id', keep_default_na = True)
g = rdfpandas.to_graph(df)
namespace_manager = NamespaceManager(Graph())
namespace_manager.bind('skos', SKOS)
namespace_manager.bind('rdfpandas', Namespace('http://github.com/cadmiumkitty/rdfpandas/'))
g = rdfpandas.to_graph(df, namespace_manager)
df_result = rdfpandas.to_dataframe(g)

pd.testing.assert_frame_equal(df.astype(np.unicode_), df_result.astype(np.unicode_), check_like = True, check_names = False)
Expand All @@ -347,8 +350,8 @@ def test_should_roundtrip_graph_to_csv_to_graph(self):
g = rdflib.Graph()
g.parse('./rdf/test.ttl', format = 'ttl')
df = rdfpandas.to_dataframe(g)
g_result = rdfpandas.to_graph(df)

print(df.T)
g_result = rdfpandas.to_graph(df, g.namespace_manager)
self.assertEquals(rdflib.compare.isomorphic(g, g_result), True)


Expand Down

0 comments on commit f50e8ea

Please sign in to comment.