Skip to content

Commit

Permalink
Add support for reading from TAR archives to flatdata-py (#182) (#210)
Browse files Browse the repository at this point in the history
Signed-off-by: Christian Ocker <[email protected]>

Co-authored-by: Christian Vetter <[email protected]>
  • Loading branch information
fermeise and VeaaC authored Oct 8, 2021
1 parent 7e1adee commit 5df78d8
Show file tree
Hide file tree
Showing 5 changed files with 179 additions and 13 deletions.
6 changes: 6 additions & 0 deletions flatdata-py/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@

Python 3 implementation of [flatdata](https://github.com/heremaps/flatdata).

## Running the tests

```sh
python3 -m nose
```

## Basic usage

Once you have [created a flatdata schema file](../README.md#creating-a-schema), you can generate a Python module to read your existing `flatdata` archive:
Expand Down
3 changes: 3 additions & 0 deletions flatdata-py/flatdata/lib/file_resource_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,6 @@ def get(self, key, is_optional=False):
return self.memory_map(filename)

return FileResourceStorage(filename)

def ls(self):
    """
    List the names of the entries found directly under `self.path`.

    :return: list of entry names as reported by `os.listdir`
    """
    entries = os.listdir(self.path)
    return entries
31 changes: 18 additions & 13 deletions flatdata-py/flatdata/lib/inspector.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pandas as pd

from .file_resource_storage import FileResourceStorage
from .tar_archive_resource_storage import TarArchiveResourceStorage
from flatdata.generator.engine import Engine
from flatdata.generator.tree.errors import FlatdataSyntaxError

Expand Down Expand Up @@ -42,9 +43,14 @@ def open_archive(path, archive=None, module_name=None):
if not os.path.exists(path):
raise RuntimeError("Specified non-existent path %s" % path)

archive_path = path if os.path.isdir(path) else os.path.dirname(path)
signatures = [p for p in os.listdir(
archive_path) if fnmatch.fnmatch(p, "*.archive")]
is_tar = path.endswith(".tar") and not os.path.isdir(path)
archive_path = path if is_tar or os.path.isdir(path) else os.path.dirname(path)
if is_tar:
storage = TarArchiveResourceStorage.create(archive_path)
else:
storage = FileResourceStorage(archive_path)

signatures = [p for p in storage.ls() if fnmatch.fnmatch(p, "*.archive")]

if not signatures:
raise RuntimeError("No archives located at path %s" % path)
Expand All @@ -62,17 +68,16 @@ def open_archive(path, archive=None, module_name=None):
raise RuntimeError("Specified archive not found at path.")

archive_name, _ = signatures[matching].rsplit('.', 1)
schema_filename = os.path.join(
archive_path, signatures[matching] + ".schema")
schema = storage.get(signatures[matching] + ".schema")

with open(schema_filename) as input_file:
try:
module, archive_type = \
Engine(input_file.read()).render_python_module(module_name=module_name,
archive_name=archive_name)
except FlatdataSyntaxError as err:
raise RuntimeError("Error reading schema: %s " % err)
archive = archive_type(FileResourceStorage(archive_path))
try:
module, archive_type = \
Engine(schema.read().decode()).render_python_module(module_name=module_name,
archive_name=archive_name)
except FlatdataSyntaxError as err:
raise RuntimeError("Error reading schema: %s " % err)

archive = archive_type(storage)
return archive, module


Expand Down
112 changes: 112 additions & 0 deletions flatdata-py/flatdata/lib/tar_archive_resource_storage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
'''
Copyright (c) 2021 HERE Europe B.V.
See the LICENSE file in the root of this project for license details.
'''

import tarfile

from .errors import CorruptResourceError
from .errors import MissingResourceError
from .file_resource_storage import FileResourceStorage


class TarArchiveResourceStorage:
    """
    Resource storage based on a memory-mapped TAR archive.

    Regular files are handed out as sections of the shared map, directories
    as nested storages that reuse the same map and entry tables and differ
    only in their `sub_path`.
    """

    def __init__(self, tar_map, file_entries, dir_entries, sub_path):
        # tar_map: the mapping returned by FileResourceStorage.memory_map
        # file_entries: member name -> (data offset, size)
        # dir_entries: set of directory member names
        # sub_path: directory acting as root of this storage ("" = archive root)
        self.sub_path = sub_path
        self.dir_entries = dir_entries
        self.file_entries = file_entries
        self.tar_map = tar_map

    @classmethod
    def create(cls, tar_path, sub_path=""):
        """
        Build a storage by indexing the members of the TAR file at `tar_path`.

        The archive is opened with mode "r:" (no transparent decompression),
        and for every regular file the data offset and size are recorded.

        :raises CorruptResourceError: if the archive contains a GNU sparse member
        """
        mapped = FileResourceStorage.memory_map(tar_path)
        regular_files = {}
        directories = set()
        with tarfile.open(tar_path, "r:") as archive:
            for member in archive:
                # Members may be stored relative to "./"; index them without it.
                member_name = member.name
                if member_name.startswith("./"):
                    member_name = member_name[2:]

                if member.type == tarfile.GNUTYPE_SPARSE:
                    raise CorruptResourceError("Sparse files are not supported")

                if member.isreg():
                    regular_files[member_name] = (member.offset_data, member.size)
                elif member.isdir() and member_name != ".":
                    directories.add(member_name)

        return cls(mapped, regular_files, directories, sub_path)

    def get(self, key, is_optional=False):
        """
        Look up `key` relative to this storage's `sub_path`.

        :return: a MemoryMapSection for a file, a nested
                 TarArchiveResourceStorage for a directory, or None when the
                 key is unknown and `is_optional` is set
        :raises MissingResourceError: if the key is unknown and not optional
        """
        full_name = self._path(key)

        if full_name in self.file_entries:
            offset, length = self.file_entries[full_name]
            return MemoryMapSection(self.tar_map, offset, length)

        if full_name in self.dir_entries:
            return TarArchiveResourceStorage(
                self.tar_map, self.file_entries, self.dir_entries, full_name)

        if is_optional:
            return None
        raise MissingResourceError(key)

    def ls(self):
        """
        List the direct children of this storage: directories first, then files.
        """
        prefix = self._path("")
        cut = len(prefix)

        def direct_children(names):
            # A direct child starts with the prefix and has no further '/'.
            return [name[cut:] for name in names
                    if name.startswith(prefix) and '/' not in name[cut:]]

        return direct_children(self.dir_entries) + direct_children(self.file_entries)

    def _path(self, key):
        """
        Return `key` qualified with `sub_path`, or `key` itself at the root.
        """
        return self.sub_path + '/' + key if self.sub_path else key


class MemoryMapSection:
    """
    Represent a slice of a memory mapped file.
    Keeps track of its position, as to emulate pointing to a dedicated file.

    Indexing, slicing and reading are all expressed relative to the start of
    the section and never reach outside of `[offset, offset + length)`.
    """

    def __init__(self, inner, offset, length):
        # inner: object providing __getitem__, seek(), read() and size()
        # offset: position of the section's first byte inside `inner`
        # length: number of bytes belonging to the section
        self.inner = inner
        self.offset = offset
        self.length = length
        # Read cursor, relative to the start of the section.
        self.pos = 0

    def __len__(self):
        return self.size()

    def __getitem__(self, key):
        """
        Return the item or sub-range addressed by `key`, relative to the section.

        Negative indices and negative slice bounds count from the end of the
        section, as for built-in sequences.

        :raises IndexError: if an integer `key` lies outside the section
        """
        if isinstance(key, slice):
            # Normalise None/negative/oversized bounds against the section
            # length before translating them into positions of `inner`.
            start, stop, step = key.indices(self.length)
            if step > 0:
                return self.inner[self.offset + start:self.offset + stop:step]

            # Negative step: shifting the normalised bounds by `offset` could
            # produce a stop of -1, which `inner` would read as "from the end".
            # Select the same elements with a forward slice and reverse them.
            picked = range(start, stop, step)
            if not picked:
                return self.inner[0:0]
            lowest = self.offset + picked[-1]
            highest = self.offset + picked[0]
            return self.inner[lowest:highest + 1:-step][::-1]

        if key < 0:
            key += self.length
        if not 0 <= key < self.length:
            raise IndexError('index out of range')
        return self.inner[self.offset + key]

    def read(self, n=None):
        """
        Read up to `n` bytes from the current position and advance it.

        `None` or a negative `n` reads everything up to the end of the section.
        """
        remaining = self.length - self.pos
        if n is None or n < 0:
            # A negative count must not be forwarded to `inner`: it would read
            # up to the end of the whole mapping rather than of this section.
            n = remaining
        self.inner.seek(self.offset + self.pos)
        data = self.inner.read(min(n, remaining))
        self.pos += len(data)
        return data

    def size(self):
        """
        Return the section length, capped by what `inner` holds past `offset`.
        """
        return min(self.length, self.inner.size() - self.offset)
40 changes: 40 additions & 0 deletions flatdata-py/tests/test_tar_resource_storage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from common import *
from flatdata.generator.engine import Engine
from flatdata.lib.tar_archive_resource_storage import TarArchiveResourceStorage

from nose.tools import eq_
import tarfile
import tempfile
import os


def check_signed_struct(s):
    """
    Assert that fields `a`, `b`, `c` and `d` of `s` hold the expected values.
    """
    expectations = (
        ("a", -0x1),
        ("b", 0x01234567),
        ("c", -0x28),
        ("d", 0),
    )
    # Fields are read one at a time so a failure is reported for the first
    # mismatching field, in declaration order.
    for field, expected in expectations:
        eq_(expected, getattr(s, field))


def test_tar_resource_storage():
    """
    Pack a small archive into a TAR file and read a resource back through
    TarArchiveResourceStorage.
    """
    module = Engine(INSTANCE_TEST_SCHEMA).render_python_module()
    valid_data = {
        "Archive.archive": ARCHIVE_SIGNATURE_PAYLOAD,
        "Archive.archive.schema": module.backward_compatibility_Archive.schema().encode(),
        "resource": RESOURCE_PAYLOAD,
        "resource.schema": module.backward_compatibility_Archive.resource_schema('resource').encode()
    }

    with tempfile.TemporaryDirectory() as tmpdir:
        archive_path = os.path.join(tmpdir, "archive.tar")
        # The context manager closes the TAR file even if writing fails.
        with tarfile.open(archive_path, "w") as tar:
            for key, value in valid_data.items():
                member_path = os.path.join(tmpdir, key)
                with open(member_path, "wb") as file:
                    file.write(value)
                # `arcname` stores the member under its bare resource name, so
                # the process working directory does not have to be changed.
                tar.add(member_path, arcname=key)

        archive = module.backward_compatibility_Archive(TarArchiveResourceStorage.create(archive_path))
        check_signed_struct(archive.resource)

0 comments on commit 5df78d8

Please sign in to comment.