Merge pull request #82 from PickwickSoft/feature/#72/data-loader-for-xml

✨ Create data loader for XML
pickwicksoft · Dec 30, 2023 · 92bda09 · 92bda09
2 parents 198d9c2 + 9e76342
commit 92bda09
Show file tree

Hide file tree

Showing 11 changed files with 354 additions and 121 deletions.
diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
@@ -64,14 +64,14 @@ jobs:
       # Install dependencies. `--no-root` means "install all dependencies but not the project
       # itself", which is what you want to avoid caching _your_ code. The `if` statement
       # ensures this only runs on a cache miss.
-      - run: poetry install --no-root
+      - run: poetry install --no-root --extras "all"
         if: steps.cache-deps.outputs.cache-hit != 'true'
 
       # Now install _your_ project. This isn't necessary for many types of projects -- particularly
       # things like Django apps don't need this. But it's a good idea since it fully-exercises the
       # pyproject.toml and makes sure that if you add things like console-scripts at some point that
       # they'll be installed and working.
-      - run: poetry install
+      - run: poetry install --extras "all"
 
       # Runs a single command using the runner's shell
       - name: Run Unittests

diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ Now you might be wondering why another library when there are already a few impl
 * The implementation achieves 100% test coverage.
 * It follows Pythonic principles, resulting in clean and readable code.
 * It adds some cool innovative features such as conditions or error handling and an even more declarative look.
-* It provides loaders for various data sources such as CSV
+* It provides loaders for various data sources such as CSV, JSON and XML files.
 
 Let's take a look at a small example:
 
@@ -213,14 +213,15 @@ Stream.concat(Stream.of([1, 2]), Stream.of([3, 4]))
 
 Creates a new Stream from multiple Streams. Order doesn't change.
 
-## Use loaders: Load data from CSV and JSON files in just one line
+## Use loaders: Load data from CSV, JSON and XML files in just one line
 
-PyStreamAPI offers a convenient way to load data from CSV and JSON files. Like that you can start processing your files right away without having to worry about reading and parsing the files.
+PyStreamAPI offers a convenient way to load data from CSV, JSON and XML files. This way, you can start processing your
+files right away without having to worry about reading and parsing the files.
 
 You can import the loaders with:
 
 ```python
-from pystreamapi.loaders import csv, json
+from pystreamapi.loaders import csv, json, xml
 ```
 Now you can use the loaders directly when creating your Stream:
 
@@ -241,6 +242,25 @@ Stream.of(json("data.json")) \
 
 You can access the attributes of the data structures directly like you would do with a normal object.
 
+For XML:
+
+In order to use the XML loader, you need to install the optional xml dependency:
+
+```bash
+pip install streams.py[xml_loader]
+```
+
+Afterward, you can use the XML loader like this:
+
+```python
+Stream.of(xml("data.xml")) \
+  .map(lambda x: x.attr1) \
+  .for_each(print)
+```
+
+Attributes are accessed using a node path syntax. For more details on how to use the node path syntax, please
+refer to the [documentation](https://pystreamapi.pickwicksoft.org/reference/data-loaders).
+
 ## API Reference
 For a more detailed documentation view the docs on GitBook: [PyStreamAPI Docs](https://pystreamapi.pickwicksoft.org/)
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "streams.py"
-version = "1.1.0"
+version = "1.2.0"
 authors = ["Stefan Garlonta <[email protected]>"]
 description = "A stream library for Python inspired by Java Stream API"
 keywords = ["streams", "parallel", "data"]
@@ -15,6 +15,11 @@ packages = [
 [tool.poetry.dependencies]
 python = ">=3.7,<4.0"
 joblib = ">=1.2,<1.4"
+defusedxml = { version = ">=0.7,<0.8", optional = true }
+
+[tool.poetry.extras]
+xml_loader = ["defusedxml"]
+all = ["defusedxml"]
 
 [tool.poetry.group.test.dependencies]
 parameterized = "*"

diff --git a/pystreamapi/__init__.py b/pystreamapi/__init__.py
@@ -1,5 +1,5 @@
 from pystreamapi.__stream import Stream
 from pystreamapi._streams.error.__levels import ErrorLevel
 
-__version__ = "1.1.0"
+__version__ = "1.2.0"
 __all__ = ["Stream", "ErrorLevel"]
diff --git a/pystreamapi/loaders/__init__.py b/pystreamapi/loaders/__init__.py
@@ -1,7 +1,9 @@
 from pystreamapi.loaders.__csv.__csv_loader import csv
 from pystreamapi.loaders.__json.__json_loader import json
+from pystreamapi.loaders.__xml.__xml_loader import xml
 
 __all__ = [
     'csv',
-    'json'
+    'json',
+    'xml'
 ]
diff --git a/pystreamapi/loaders/__xml/__init__.py b/pystreamapi/loaders/__xml/__init__.py
diff --git a/pystreamapi/loaders/__xml/__xml_loader.py b/pystreamapi/loaders/__xml/__xml_loader.py
@@ -0,0 +1,117 @@
+try:
+    from defusedxml import ElementTree
+except ImportError as exc:
+    raise ImportError(
+        "Please install the xml_loader extra dependency to use the xml loader."
+    ) from exc
+from collections import namedtuple
+from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
+from pystreamapi.loaders.__loader_utils import LoaderUtils
+
+
+class __XmlLoaderUtil:
+    """Holder for the parsing options shared by the XML loader helpers."""
+
+    def __init__(self):
+        # Whether the children of the root element (instead of the root) become the result.
+        self.retrieve_children = True
+        # Whether the text of leaf elements is passed through LoaderUtils.try_cast.
+        self.cast_types = True
+
+
+# Module-wide options object that xml() writes to and the parse helpers read from.
+config = __XmlLoaderUtil()
+
+
+def xml(src: str, read_from_src=False, retrieve_children=True, cast_types=True,
+        encoding="utf-8") -> LazyFileIterable:
+    """
+    Loads XML data from either a path or a string and converts it into a list of namedtuples.
+    Warning: This method isn't safe against malicious XML trees. Parse only safe XML from sources
+    you trust.
+
+    :param src: Either the path to an XML file or an XML string.
+    :param read_from_src: If True, src is treated as an XML string. If False, src is treated as
+        a path to an XML file.
+    :param retrieve_children: If True, the children of the root element are used as stream
+        elements. If False, the parsed root element is the only stream element.
+    :param cast_types: Set as False to disable casting of values to int, bool or float.
+    :param encoding: The encoding of the XML file.
+    :return: A LazyFileIterable wrapping the function that loads and parses the XML.
+    """
+    def apply_options():
+        # The options live in the module-level ``config`` object. Write them right before
+        # parsing rather than when xml() is called, so that a loader which is consumed after
+        # another xml() call still parses with the options it was created with.
+        config.cast_types = cast_types
+        config.retrieve_children = retrieve_children
+
+    def load_from_string():
+        apply_options()
+        return __load_xml_string(src)
+
+    if read_from_src:
+        return LazyFileIterable(load_from_string)
+
+    # The path is validated up front, before any loader is created.
+    path = LoaderUtils.validate_path(src)
+
+    def load_from_file():
+        apply_options()
+        return __load_xml_file(path, encoding)
+
+    return LazyFileIterable(load_from_file)
+
+
+def __load_xml_file(file_path, encoding):
+    """Read the XML file at file_path and parse its content; an empty file yields []."""
+    # skipcq: PTC-W6004
+    with open(file_path, mode='r', encoding=encoding) as handle:
+        content = handle.read()
+    if not content:
+        return []
+    return __parse_xml_string(content)
+
+
+def __load_xml_string(xml_string):
+    """Parse XML that is given directly as a string; delegates to __parse_xml_string."""
+    parsed = __parse_xml_string(xml_string)
+    return parsed
+
+
+def __parse_xml_string(xml_string):
+    """
+    Parse an XML string into the list of stream elements.
+
+    :param xml_string: The XML document as a string.
+    :return: If config.retrieve_children is set, the parsed children of the root element
+        (an empty list if the root has none); otherwise a one-element list holding the
+        parsed root.
+    """
+    root = ElementTree.fromstring(xml_string)
+    if not config.retrieve_children:
+        return [__parse_xml(root)]
+    if len(root) == 0:
+        # A childless root parses to a plain value instead of a namedtuple. Flattening it
+        # would split a string into characters or fail on a value that is not iterable
+        # (e.g. a cast number), so report "no children" explicitly.
+        return []
+    return __flatten(__parse_xml(root))
+
+
+def __parse_xml(element):
+    """Convert an XML element into a Python value, dispatching on its number of children."""
+    child_count = len(element)
+    if child_count > 1:
+        return __parse_multiple_elements(element)
+    if child_count == 1:
+        return __parse_single_element(element)
+    return __parse_empty_element(element)
+
+
+def __parse_empty_element(element):
+    """Return the text of a childless element, passed through try_cast if casting is enabled."""
+    text = element.text
+    if config.cast_types:
+        return LoaderUtils.try_cast(text)
+    return text
+
+
+def __parse_single_element(element):
+    """Wrap the parsed only child of element in a one-field namedtuple named after element."""
+    only_child = element[0]
+    # Parse the child before building the namedtuple type, as the field holds its value.
+    parsed_child = __parse_xml(only_child)
+    return namedtuple(element.tag, [only_child.tag])(parsed_child)
+
+
+def __parse_multiple_elements(element):
+    """Group the parsed children of element by tag and return them as a namedtuple."""
+    grouped = {}
+    for child in element:
+        # Children sharing a tag are collected in one list, in document order.
+        grouped.setdefault(child.tag, []).append(__parse_xml(child))
+    fields = __filter_single_items(grouped)
+    record = namedtuple(element.tag, fields.keys())
+    return record(*fields.values())
+
+
+def __filter_single_items(tag_dict):
+    """Return a copy of tag_dict in which every one-element list is replaced by its element."""
+    unwrapped = {}
+    for tag, items in tag_dict.items():
+        unwrapped[tag] = items[0] if len(items) == 1 else items
+    return unwrapped
+
+
+def __flatten(data):
+    """Flatten one level: list items of data are spliced in, all other items are kept as is."""
+    flat = []
+    for entry in data:
+        flat.extend(entry if isinstance(entry, list) else [entry])
+    return flat
diff --git a/setup.cfg b/setup.cfg
diff --git a/tests/test_xml_loader.py b/tests/test_xml_loader.py
@@ -0,0 +1,107 @@
+# pylint: disable=not-context-manager
+from unittest import TestCase
+from unittest.mock import patch, mock_open
+from xml.etree.ElementTree import ParseError
+
+from file_test import OPEN, PATH_EXISTS, PATH_ISFILE
+from pystreamapi.loaders import xml
+
+file_content = """
+<employees>
+    <employee>
+        <name>John Doe</name>
+        <salary>80000</salary>
+    </employee>
+    <employee>
+        <name>Alice Smith</name>
+        <child>
+            <name>Frank</name>
+        </child>
+    </employee>
+    <founder>
+        <cars>
+            <car>Bugatti</car>
+            <car>Mercedes</car>
+        </cars>
+    </founder>
+</employees>
+"""
+file_path = 'path/to/data.xml'
+
+
+class TestXmlLoader(TestCase):
+    """Tests for the xml loader, reading from a mocked file and from a string."""
+
+    # The multi-manager ``with`` statements below use backslash continuations instead of a
+    # parenthesized group, so the tests also run on Python versions that do not support
+    # parenthesized context managers.
+
+    def test_xml_loader_from_file_children(self):
+        with patch(OPEN, mock_open(read_data=file_content)), \
+                patch(PATH_EXISTS, return_value=True), \
+                patch(PATH_ISFILE, return_value=True):
+            data = xml(file_path)
+            self.assertEqual(len(data), 3)
+            self.assertEqual(data[0].salary, 80000)
+            self.assertIsInstance(data[0].salary, int)
+            self.assertEqual(data[1].child.name, "Frank")
+            self.assertIsInstance(data[1].child.name, str)
+            self.assertEqual(data[2].cars.car[0], 'Bugatti')
+            self.assertIsInstance(data[2].cars.car[0], str)
+
+    def test_xml_loader_from_file_no_children_false(self):
+        with patch(OPEN, mock_open(read_data=file_content)), \
+                patch(PATH_EXISTS, return_value=True), \
+                patch(PATH_ISFILE, return_value=True):
+            data = xml(file_path, retrieve_children=False)
+            self.assertEqual(len(data), 1)
+            self.assertEqual(data[0].employee[0].salary, 80000)
+            self.assertIsInstance(data[0].employee[0].salary, int)
+            self.assertEqual(data[0].employee[1].child.name, "Frank")
+            self.assertIsInstance(data[0].employee[1].child.name, str)
+            self.assertEqual(data[0].founder.cars.car[0], 'Bugatti')
+            self.assertIsInstance(data[0].founder.cars.car[0], str)
+
+    def test_xml_loader_no_casting(self):
+        with patch(OPEN, mock_open(read_data=file_content)), \
+                patch(PATH_EXISTS, return_value=True), \
+                patch(PATH_ISFILE, return_value=True):
+            data = xml(file_path, cast_types=False)
+            self.assertEqual(len(data), 3)
+            self.assertEqual(data[0].salary, '80000')
+            self.assertIsInstance(data[0].salary, str)
+            self.assertEqual(data[1].child.name, "Frank")
+            self.assertIsInstance(data[1].child.name, str)
+            self.assertEqual(data[2].cars.car[0], 'Bugatti')
+            self.assertIsInstance(data[2].cars.car[0], str)
+
+    def test_xml_loader_is_iterable(self):
+        with patch(OPEN, mock_open(read_data=file_content)), \
+                patch(PATH_EXISTS, return_value=True), \
+                patch(PATH_ISFILE, return_value=True):
+            data = xml(file_path)
+            self.assertEqual(len(list(iter(data))), 3)
+
+    def test_xml_loader_with_empty_file(self):
+        with patch(OPEN, mock_open(read_data="")), \
+                patch(PATH_EXISTS, return_value=True), \
+                patch(PATH_ISFILE, return_value=True):
+            data = xml(file_path)
+            self.assertEqual(len(data), 0)
+
+    def test_xml_loader_with_invalid_path(self):
+        with self.assertRaises(FileNotFoundError):
+            xml('path/to/invalid.xml')
+
+    def test_xml_loader_with_no_file(self):
+        with self.assertRaises(ValueError):
+            xml('./')
+
+    def test_xml_loader_from_string(self):
+        data = xml(file_content, read_from_src=True)
+        self.assertEqual(len(data), 3)
+        self.assertEqual(data[0].salary, 80000)
+        self.assertIsInstance(data[0].salary, int)
+        self.assertEqual(data[1].child.name, "Frank")
+        self.assertIsInstance(data[1].child.name, str)
+        self.assertEqual(data[2].cars.car[0], 'Bugatti')
+        self.assertIsInstance(data[2].cars.car[0], str)
+
+    def test_xml_loader_from_empty_string(self):
+        # The length is requested inside the block so that parsing the empty string
+        # happens while assertRaises is active.
+        with self.assertRaises(ParseError):
+            len(xml('', read_from_src=True))
diff --git a/tox.ini b/tox.ini
@@ -8,11 +8,12 @@ deps =
     optional.py
     joblib
     parameterized
+    defusedxml
 commands =
     coverage run -m unittest discover -s tests -t tests --pattern 'test_*.py'
     coverage xml
 
 [coverage:run]
 relative_files = True
 source = pystreamapi/
-branch = True
+branch = True