Add CSV (#8)
* Add venv/ to gitignore

* Copy csv.py from core

* Copy test_csv.py from core

* Update imports

* Add csv to readme, fix json loading example

* Add line break

* Minor version bump

* Fix readme examples

* Update bodylabs. to baiji. in comments

* Clean up method call
jbwhite authored May 10, 2017
1 parent b5eabf2 commit 5e1a032
Showing 5 changed files with 239 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -104,3 +104,4 @@ target/
.env

/bodylabs-python-style/
/venv/
19 changes: 16 additions & 3 deletions README.md
@@ -7,7 +7,7 @@ Read and write common file formats to Amazon S3 and local files.
Features
--------

-- Reads and writes Pickle, JSON, and YAML
+- Reads and writes Pickle, CSV, JSON, and YAML
- Works without an S3 connection (with local files)
- Supports Python 2.7 and uses boto2
- Supports OS X, Linux, and Windows
@@ -22,15 +22,28 @@ from baiji.serialization import json
with open(filename, 'w') as f:
    json.dump(foo, f)
with open(filename, 'r') as f:
-    foo = json.load(foo, f)
+    foo = json.load(f)
```

```py
from baiji.serialization import json
-json.dump(filename)
+json.dump(foo, filename)
foo = json.load(filename)
```

```py
from baiji.serialization import csv
with open(filename, 'w') as f:
    csv.dump(foo, f)
with open(filename, 'r') as f:
    foo = csv.load(f)
```

```py
from baiji.serialization import csv
csv.dump(foo, filename)
foo = csv.load(filename)
```

Development
-----------
2 changes: 1 addition & 1 deletion baiji/serialization/__init__.py
@@ -7,4 +7,4 @@
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)

-__version__ = '2.0.0'
+__version__ = '2.1.0'
169 changes: 169 additions & 0 deletions baiji/serialization/csv.py
@@ -0,0 +1,169 @@
from __future__ import absolute_import

__all__ = ['load', 'dump', 'dumps', 'EXTENSION']

EXTENSION = '.csv'


def load(f, *args, **kwargs):
    from baiji.serialization.util.openlib import ensure_file_open_and_call
    return ensure_file_open_and_call(f, _load, mode='rb', *args, **kwargs)


def dump(obj, f):
    from baiji.serialization.util.openlib import ensure_file_open_and_call
    return ensure_file_open_and_call(f, _dump, mode='wb', obj=obj)

def dumps(obj):
    import StringIO
    output = StringIO.StringIO()
    _dump(output, obj)
    out_string = output.getvalue()
    output.close()
    return out_string

def _load(f, header_row=True, header_row_transformer=lambda x: x):
    '''
    header_row_transformer: Give the caller a chance to rewrite the header row.
        Accepts one argument, a sequence of field names, and should return a
        modified sequence.
    '''
    import csv

    reader = csv.reader(f)
    line_number = 1
    result = []

    if header_row:
        field_names = next(reader)
        field_names = header_row_transformer(field_names)
        line_number += 1

        for row_values in reader:
            if len(row_values) != len(field_names):
                raise ValueError("Header row contains %s items but line %s contains %s" % \
                    (len(field_names), line_number, len(row_values)))

            result.append({k: v for k, v in zip(field_names, row_values)})
            line_number += 1

    else:
        for row_values in reader:
            result.append({i: v for i, v in enumerate(row_values)})
            line_number += 1

    return result


def _dump(f, obj):
    '''
    Per the docs for the csv module, use this with binary mode on platforms
    where it matters:
        from baiji.serialization import csv
        with open(file, 'wb') as f:
            csv.dump(obj, f)
    or
        csv.dump(obj, file)
    '''
    import csv
    if not isinstance(obj, list):
        raise ValueError('obj should be a list of lists or tuples')
    if not all([isinstance(x, tuple) or isinstance(x, list) for x in obj]):
        raise ValueError('obj should be a list of lists or tuples')
    writer = csv.writer(f)
    for item in obj:
        writer.writerow(item)


class CSVSerializer(object):
    '''
    Simple CSV serializer. Subclasses can support serializing arrays of
    arbitrary objects, using their own serialization format. Subclasses
    may also set header to an array of tuples which will be used as the
    header content.
    '''
    header = []
    def __init__(self, data):
        self._data = data
    @property
    def body(self):
        if hasattr(self, 'format'):
            return self.format(self._data)
        else:
            return self._data
    def render(self):
        return self.header + self.body
    def dump(self, f):
        # Delegate to baiji.serialization.csv.dump()
        dump(self.render(), f)


class CSVCollectionSerializer(CSVSerializer):
    '''
    Serialize to CSV from a collection of dicts. Dicts should have the
    same keys, which become the column headings.
    '''
    def __init__(self, collection, row_ordering=None):
        '''
        row_ordering: When collection is a dictionary, an optional array
            of keys specifying the order in which to emit the items.
        '''
        super(CSVCollectionSerializer, self).__init__(collection)
        self.keys = self.compute_keys(collection)
        if isinstance(collection, dict):
            self.header = [[''] + self.keys]
        else:
            self.header = [self.keys]
        self.row_ordering = row_ordering

    @classmethod
    def compute_keys(cls, collection):
        if isinstance(collection, dict):
            first_value = next(collection.itervalues())
        else:
            first_value = collection[0]
        result = sorted(first_value.keys())

        # Make sure the keys are consistent.
        if isinstance(collection, dict):
            keyed_collection = collection
        else:
            keyed_collection = {index: value for index, value in enumerate(collection)}

        expected = set(result)
        for key, item in keyed_collection.iteritems():
            if set(item.keys()) != expected:
                message = 'Item %s had different keys (got %s, expected %s)' % \
                    (key, ' '.join(item.keys()), ' '.join(expected))
                raise ValueError(message)

        return result

    def format(self, collection):
        if isinstance(collection, dict):
            if self.row_ordering is not None:
                row_ordering = self.row_ordering
            else:
                row_ordering = collection.keys()
            row_heads = [[key] for key in row_ordering]
            rows = [collection[key] for key in row_ordering]
        else:
            row_heads = [[] for _ in range(len(collection))]
            rows = collection

        return [
            row_head +
            [
                row[k]
                for k in self.keys
            ]
            for row_head, row in zip(row_heads, rows)
        ]
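
The `_load` docstring above explains `header_row_transformer`, but the diff has no end-to-end example, so here is a minimal usage sketch of the module-level API. It is not part of this commit: `people.csv` is a hypothetical local path (the README states S3 keys work the same way), and passing `header_row_transformer` through `load()` assumes `ensure_file_open_and_call` forwards extra keyword arguments to `_load`, as the pass-through signature suggests.

```py
from baiji.serialization import csv

# dump() expects a list of lists or tuples; the first row here is the header.
rows = [
    ('Name', 'City'),
    ('Ada', 'London'),
    ('Grace', 'Arlington'),
]
csv.dump(rows, 'people.csv')   # hypothetical local path
csv_text = csv.dumps(rows)     # the same rows rendered to an in-memory string

# load() zips each data row into a dict keyed by the header row.
# header_row_transformer lets the caller rewrite the header first;
# here it lowercases the field names (assumes kwargs are forwarded to _load).
records = csv.load('people.csv',
                   header_row_transformer=lambda names: [n.lower() for n in names])
# records == [{'name': 'Ada', 'city': 'London'},
#             {'name': 'Grace', 'city': 'Arlington'}]
```
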
52 changes: 52 additions & 0 deletions baiji/serialization/test_csv.py
@@ -0,0 +1,52 @@
import unittest

class TestCSV(unittest.TestCase):

    def test_collection_csv_serializer_renders_correct_rows(self):
        from baiji.serialization.csv import CSVCollectionSerializer

        example_data = {
            'one': {'foo': 'baz1', 'bar': 'inga1'},
            'three': {'foo': 'baz3', 'bar': 'inga3'},
            'two': {'foo': 'baz2', 'bar': 'inga2'},
        }
        ordering = ['one', 'two', 'three']

        serializer = CSVCollectionSerializer(
            collection=example_data,
            row_ordering=ordering
        )

        expected = [
            ['', 'bar', 'foo'],
            ['one', 'inga1', 'baz1'],
            ['two', 'inga2', 'baz2'],
            ['three', 'inga3', 'baz3'],
        ]

        self.assertEqual(serializer.render(), expected)

    def test_collection_csv_serializer_works_with_row_ordering_np_array(self):
        import numpy as np
        from baiji.serialization.csv import CSVCollectionSerializer

        example_data = {
            'one': {'foo': 'baz1', 'bar': 'inga1'},
            'three': {'foo': 'baz3', 'bar': 'inga3'},
            'two': {'foo': 'baz2', 'bar': 'inga2'},
        }
        ordering = np.array(['one', 'two', 'three'])

        serializer = CSVCollectionSerializer(
            collection=example_data,
            row_ordering=ordering
        )

        expected = [
            ['', 'bar', 'foo'],
            ['one', 'inga1', 'baz1'],
            ['two', 'inga2', 'baz2'],
            ['three', 'inga3', 'baz3'],
        ]

        self.assertEqual(serializer.render(), expected)
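
Both tests cover `CSVCollectionSerializer` with a dict of dicts. For the base class, the `CSVSerializer` docstring says subclasses may set `header` and supply their own `format`; the sketch below illustrates that pattern. It is not part of this commit, and `PointSerializer` and `points.csv` are invented names used only for illustration.

```py
from baiji.serialization.csv import CSVSerializer

class PointSerializer(CSVSerializer):
    # render() prepends header rows verbatim; one row of column names here.
    header = [('x', 'y')]

    def format(self, points):
        # Turn arbitrary objects (here, (x, y) tuples) into CSV rows.
        return [[x, y] for x, y in points]

serializer = PointSerializer([(0, 0), (3, 4)])
print(serializer.render())     # [('x', 'y'), [0, 0], [3, 4]]
serializer.dump('points.csv')  # delegates to baiji.serialization.csv.dump()
```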
