-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
✨ Create data loader for XML
- Loading branch information
Showing
11 changed files
with
354 additions
and
121 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[tool.poetry] | ||
name = "streams.py" | ||
version = "1.1.0" | ||
version = "1.2.0" | ||
authors = ["Stefan Garlonta <[email protected]>"] | ||
description = "A stream library for Python inspired by Java Stream API" | ||
keywords = ["streams", "parallel", "data"] | ||
|
@@ -15,6 +15,11 @@ packages = [ | |
[tool.poetry.dependencies] | ||
python = ">=3.7,<4.0" | ||
joblib = ">=1.2,<1.4" | ||
defusedxml = { version = ">=0.7,<0.8", optional = true } | ||
|
||
[tool.poetry.extras] | ||
xml_loader = ["defusedxml"] | ||
all = ["defusedxml"] | ||
|
||
[tool.poetry.group.test.dependencies] | ||
parameterized = "*" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
from pystreamapi.__stream import Stream | ||
from pystreamapi._streams.error.__levels import ErrorLevel | ||
|
||
__version__ = "1.1.0" | ||
__version__ = "1.2.0" | ||
__all__ = ["Stream", "ErrorLevel"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,9 @@ | ||
from pystreamapi.loaders.__csv.__csv_loader import csv | ||
from pystreamapi.loaders.__json.__json_loader import json | ||
from pystreamapi.loaders.__xml.__xml_loader import xml | ||
|
||
__all__ = [ | ||
'csv', | ||
'json' | ||
'json', | ||
'xml' | ||
] |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
try: | ||
from defusedxml import ElementTree | ||
except ImportError as exc: | ||
raise ImportError( | ||
"Please install the xml_loader extra dependency to use the xml loader." | ||
) from exc | ||
from collections import namedtuple | ||
from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable | ||
from pystreamapi.loaders.__loader_utils import LoaderUtils | ||
|
||
|
||
class __XmlLoaderUtil:
    """Hold the parsing options shared by the XML loader helpers.

    Both flags start enabled:

    * ``cast_types`` -- whether leaf text is passed through the cast helper.
    * ``retrieve_children`` -- whether the root's children (rather than the
      root itself) become the resulting elements.
    """

    def __init__(self):
        # Defaults mirror the defaults of the public ``xml`` loader function.
        self.cast_types, self.retrieve_children = True, True


# Module-level options object read by the parsing helpers in this module.
config = __XmlLoaderUtil()
|
||
|
||
def xml(src: str, read_from_src=False, retrieve_children=True, cast_types=True,
        encoding="utf-8") -> LazyFileIterable:
    """
    Load XML data from either a path or a string as a lazy list of namedtuples.

    Warning: This method isn't safe against malicious XML trees. Parse only safe XML from sources
    you trust.

    The path (when ``read_from_src`` is False) is validated immediately via
    ``LoaderUtils.validate_path``; reading and parsing are deferred to the returned
    ``LazyFileIterable``.

    :param src: Either the path to an XML file or an XML string.
    :param read_from_src: If True, src is treated as an XML string. If False, src is treated as
        a path to an XML file.
    :param retrieve_children: If true, the children of the root element are used as stream
        elements.
    :param cast_types: Set as False to disable casting of values to int, bool or float.
    :param encoding: The encoding of the XML file.
    :return: A LazyFileIterable producing namedtuples, where each namedtuple represents an
        XML element.
    """
    def apply_options():
        config.cast_types = cast_types
        config.retrieve_children = retrieve_children

    def load_from_string():
        # The shared config may have been overwritten by a later xml() call
        # before this loader runs, so restore this call's options first.
        apply_options()
        return __load_xml_string(src)

    def load_from_file():
        apply_options()
        return __load_xml_file(path, encoding)

    # Kept for backward compatibility: the options are also visible on the
    # shared config right after the call, as before.
    apply_options()
    if read_from_src:
        return LazyFileIterable(load_from_string)
    path = LoaderUtils.validate_path(src)
    return LazyFileIterable(load_from_file)
|
||
|
||
def __load_xml_file(file_path, encoding):
    """Read the XML file at ``file_path`` and parse its content.

    An empty file produces an empty list instead of being handed to the parser.
    """
    # skipcq: PTC-W6004
    with open(file_path, mode='r', encoding=encoding) as handle:
        content = handle.read()
    if not content:
        return []
    return __parse_xml_string(content)
|
||
|
||
def __load_xml_string(xml_string):
    """Parse XML passed directly as a string into a list of namedtuples."""
    parsed = __parse_xml_string(xml_string)
    return parsed
|
||
|
||
def __parse_xml_string(xml_string):
    """Parse an XML string and convert it into a list of parsed elements.

    When ``config.retrieve_children`` is disabled, the parsed root is returned
    as the single item of the list. Otherwise the parsed children of the root
    are returned (lists of same-tag siblings are flattened into the result).
    """
    root = ElementTree.fromstring(xml_string)
    if not config.retrieve_children:
        return [__parse_xml(root)]
    # A root without children parses to a scalar (text, a cast value or None).
    # Flattening that would raise TypeError or split a string into characters,
    # so a childless root simply yields no elements. ``len`` is used because
    # truth-testing an Element is deprecated.
    if len(root) == 0:
        return []
    return __flatten(__parse_xml(root))
|
||
|
||
def __parse_xml(element):
    """Dispatch an XML element to the parser matching its number of children."""
    child_count = len(element)
    if child_count > 1:
        return __parse_multiple_elements(element)
    if child_count == 1:
        return __parse_single_element(element)
    return __parse_empty_element(element)
|
||
|
||
def __parse_empty_element(element):
    """Return the text of a childless XML element.

    The text is passed through ``LoaderUtils.try_cast`` when
    ``config.cast_types`` is enabled, otherwise it is returned unchanged.
    """
    if config.cast_types:
        return LoaderUtils.try_cast(element.text)
    return element.text
|
||
|
||
def __parse_single_element(element):
    """Convert an XML element with exactly one child into a one-field namedtuple.

    The namedtuple is named after the element's tag and its single field after
    the child's tag.
    """
    only_child = element[0]
    child_value = __parse_xml(only_child)
    record_type = namedtuple(element.tag, [only_child.tag])
    return record_type(child_value)
|
||
|
||
def __parse_multiple_elements(element):
    """Convert an XML element with several children into a namedtuple.

    Children are grouped by tag in document order; a tag occurring once maps
    to its parsed value, a repeated tag maps to the list of parsed values.
    """
    grouped = {}
    for child in element:
        # The group is created before the child is parsed, then extended.
        grouped.setdefault(child.tag, []).append(__parse_xml(child))
    collapsed = __filter_single_items(grouped)
    record_type = namedtuple(element.tag, collapsed.keys())
    return record_type(*collapsed.values())
|
||
|
||
def __filter_single_items(tag_dict):
    """Unwrap every one-element list in ``tag_dict``; keep all other values as they are."""
    unwrapped = {}
    for tag, items in tag_dict.items():
        if len(items) == 1:
            unwrapped[tag] = items[0]
        else:
            unwrapped[tag] = items
    return unwrapped
|
||
|
||
def __flatten(data):
    """Flatten one level: list items are spliced in, every other item is kept as is."""
    flat = []
    for entry in data:
        flat += entry if isinstance(entry, list) else [entry]
    return flat
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
# pylint: disable=not-context-manager | ||
from unittest import TestCase | ||
from unittest.mock import patch, mock_open | ||
from xml.etree.ElementTree import ParseError | ||
|
||
from file_test import OPEN, PATH_EXISTS, PATH_ISFILE | ||
from pystreamapi.loaders import xml | ||
|
||
file_content = """ | ||
<employees> | ||
<employee> | ||
<name>John Doe</name> | ||
<salary>80000</salary> | ||
</employee> | ||
<employee> | ||
<name>Alice Smith</name> | ||
<child> | ||
<name>Frank</name> | ||
</child> | ||
</employee> | ||
<founder> | ||
<cars> | ||
<car>Bugatti</car> | ||
<car>Mercedes</car> | ||
</cars> | ||
</founder> | ||
</employees> | ||
""" | ||
file_path = 'path/to/data.xml' | ||
|
||
|
||
class TestXmlLoader(TestCase):
    """Tests for the ``xml`` loader, reading from mocked files and from strings.

    The ``with`` statements use backslash continuations instead of a
    parenthesized group, because the parenthesized form is only valid syntax
    on newer Python versions than the minimum the project declares.
    """

    def test_xml_loader_from_file_children(self):
        """Children of the root become the elements; values are cast."""
        with patch(OPEN, mock_open(read_data=file_content)), \
                patch(PATH_EXISTS, return_value=True), \
                patch(PATH_ISFILE, return_value=True):
            data = xml(file_path)
            self.assertEqual(len(data), 3)
            self.assertEqual(data[0].salary, 80000)
            self.assertIsInstance(data[0].salary, int)
            self.assertEqual(data[1].child.name, "Frank")
            self.assertIsInstance(data[1].child.name, str)
            self.assertEqual(data[2].cars.car[0], 'Bugatti')
            self.assertIsInstance(data[2].cars.car[0], str)

    def test_xml_loader_from_file_no_children_false(self):
        """With retrieve_children=False the root is the only element."""
        with patch(OPEN, mock_open(read_data=file_content)), \
                patch(PATH_EXISTS, return_value=True), \
                patch(PATH_ISFILE, return_value=True):
            data = xml(file_path, retrieve_children=False)
            self.assertEqual(len(data), 1)
            self.assertEqual(data[0].employee[0].salary, 80000)
            self.assertIsInstance(data[0].employee[0].salary, int)
            self.assertEqual(data[0].employee[1].child.name, "Frank")
            self.assertIsInstance(data[0].employee[1].child.name, str)
            self.assertEqual(data[0].founder.cars.car[0], 'Bugatti')
            self.assertIsInstance(data[0].founder.cars.car[0], str)

    def test_xml_loader_no_casting(self):
        """With cast_types=False numeric text stays a string."""
        with patch(OPEN, mock_open(read_data=file_content)), \
                patch(PATH_EXISTS, return_value=True), \
                patch(PATH_ISFILE, return_value=True):
            data = xml(file_path, cast_types=False)
            self.assertEqual(len(data), 3)
            self.assertEqual(data[0].salary, '80000')
            self.assertIsInstance(data[0].salary, str)
            self.assertEqual(data[1].child.name, "Frank")
            self.assertIsInstance(data[1].child.name, str)
            self.assertEqual(data[2].cars.car[0], 'Bugatti')
            self.assertIsInstance(data[2].cars.car[0], str)

    def test_xml_loader_is_iterable(self):
        """The loader result can be iterated."""
        with patch(OPEN, mock_open(read_data=file_content)), \
                patch(PATH_EXISTS, return_value=True), \
                patch(PATH_ISFILE, return_value=True):
            data = xml(file_path)
            self.assertEqual(len(list(iter(data))), 3)

    def test_xml_loader_with_empty_file(self):
        """An empty file yields no elements."""
        with patch(OPEN, mock_open(read_data="")), \
                patch(PATH_EXISTS, return_value=True), \
                patch(PATH_ISFILE, return_value=True):
            data = xml(file_path)
            self.assertEqual(len(data), 0)

    def test_xml_loader_with_invalid_path(self):
        """A non-existent path is rejected with FileNotFoundError."""
        with self.assertRaises(FileNotFoundError):
            xml('path/to/invalid.xml')

    def test_xml_loader_with_no_file(self):
        """A path that is not a file is rejected with ValueError."""
        with self.assertRaises(ValueError):
            xml('./')

    def test_xml_loader_from_string(self):
        """XML passed as a string is parsed like file content."""
        data = xml(file_content, read_from_src=True)
        self.assertEqual(len(data), 3)
        self.assertEqual(data[0].salary, 80000)
        self.assertIsInstance(data[0].salary, int)
        self.assertEqual(data[1].child.name, "Frank")
        self.assertIsInstance(data[1].child.name, str)
        self.assertEqual(data[2].cars.car[0], 'Bugatti')
        self.assertIsInstance(data[2].cars.car[0], str)

    def test_xml_loader_from_empty_string(self):
        """An empty XML string raises ParseError once the data is accessed."""
        with self.assertRaises(ParseError):
            len(xml('', read_from_src=True))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters