Add CSV (#8)
* Add venv/ to gitignore

* Copy csv.py from core

* Copy test_csv.py from core

* Update imports

* Add csv to readme, fix json loading example

* Add line break

* Minor version bump

* Fix readme examples

* Update bodylabs. to baiji. in comments

* Clean up method call
jbwhite authored May 10, 2017
1 parent b5eabf2 commit 5e1a032
Showing 5 changed files with 239 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -104,3 +104,4 @@ target/
.env

/bodylabs-python-style/
/venv/
19 changes: 16 additions & 3 deletions README.md
@@ -7,7 +7,7 @@ Read and write common file formats to Amazon S3 and local files.
Features
--------

-- Reads and writes Pickle, JSON, and YAML
+- Reads and writes Pickle, CSV, JSON, and YAML
- Works without an S3 connection (with local files)
- Supports Python 2.7 and uses boto2
- Supports OS X, Linux, and Windows
@@ -22,15 +22,28 @@ from baiji.serialization import json
with open(filename, 'w') as f:
    json.dump(foo, f)
with open(filename, 'r') as f:
-    foo = json.load(foo, f)
+    foo = json.load(f)
```

```py
from baiji.serialization import json
-json.dump(filename)
+json.dump(foo, filename)
foo = json.load(filename)
```

```py
from baiji.serialization import csv
with open(filename, 'w') as f:
    csv.dump(foo, f)
with open(filename, 'r') as f:
    foo = csv.load(f)
```

```py
from baiji.serialization import csv
csv.dump(foo, filename)
foo = csv.load(filename)
```

Development
-----------
2 changes: 1 addition & 1 deletion baiji/serialization/__init__.py
@@ -7,4 +7,4 @@
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)

-__version__ = '2.0.0'
+__version__ = '2.1.0'
169 changes: 169 additions & 0 deletions baiji/serialization/csv.py
@@ -0,0 +1,169 @@
from __future__ import absolute_import

__all__ = ['load', 'dump', 'dumps', 'EXTENSION']

EXTENSION = '.csv'


def load(f, *args, **kwargs):
    from baiji.serialization.util.openlib import ensure_file_open_and_call
    return ensure_file_open_and_call(f, _load, mode='rb', *args, **kwargs)


def dump(obj, f):
    from baiji.serialization.util.openlib import ensure_file_open_and_call
    return ensure_file_open_and_call(f, _dump, mode='wb', obj=obj)

def dumps(obj):
    import StringIO
    output = StringIO.StringIO()
    _dump(output, obj)
    out_string = output.getvalue()
    output.close()
    return out_string

def _load(f, header_row=True, header_row_transformer=lambda x: x):
    '''
    header_row_transformer: Give the caller a chance to rewrite the header row.
        Accepts one argument, a sequence of field names, and should return a
        modified sequence.
    '''
    import csv

    reader = csv.reader(f)
    line_number = 1
    result = []

    if header_row:
        field_names = next(reader)
        field_names = header_row_transformer(field_names)
        line_number += 1

        for row_values in reader:
            if len(row_values) != len(field_names):
                raise ValueError("Header row contains %s items but line %s contains %s" % \
                    (len(field_names), line_number, len(row_values)))

            result.append({k: v for k, v in zip(field_names, row_values)})
            line_number += 1

    else:
        for row_values in reader:
            result.append({i: v for i, v in enumerate(row_values)})
            line_number += 1

    return result


def _dump(f, obj):
    '''
    Per the docs for the csv module, use this with binary mode on platforms
    where it matters:
        from baiji.serialization import csv
        with open(file, 'wb') as f:
            csv.dump(obj, f)
    or
        csv.dump(obj, file)
    '''
    import csv
    if not isinstance(obj, list):
        raise ValueError('obj should be a list of lists or tuples')
    if not all([isinstance(x, tuple) or isinstance(x, list) for x in obj]):
        raise ValueError('obj should be a list of lists or tuples')
    writer = csv.writer(f)
    for item in obj:
        writer.writerow(item)


class CSVSerializer(object):
    '''
    Simple CSV serializer. Subclasses can support serializing arrays of
    arbitrary objects, using their own serialization format. Subclasses
    may also set header to an array of tuples which will be used as the
    header content.
    '''
    header = []
    def __init__(self, data):
        self._data = data
    @property
    def body(self):
        if hasattr(self, 'format'):
            return self.format(self._data)
        else:
            return self._data
    def render(self):
        return self.header + self.body
    def dump(self, f):
        # Delegate to baiji.serialization.csv.dump()
        dump(self.render(), f)


class CSVCollectionSerializer(CSVSerializer):
    '''
    Serialize to CSV from a collection of dicts. Dicts should have the
    same keys, which become the column headings.
    '''
    def __init__(self, collection, row_ordering=None):
        '''
        row_ordering: When collection is a dictionary, an optional array
            of keys specifying the order in which to emit the items.
        '''
        super(CSVCollectionSerializer, self).__init__(collection)
        self.keys = self.compute_keys(collection)
        if isinstance(collection, dict):
            self.header = [[''] + self.keys]
        else:
            self.header = [self.keys]
        self.row_ordering = row_ordering

    @classmethod
    def compute_keys(cls, collection):
        if isinstance(collection, dict):
            first_value = next(collection.itervalues())
        else:
            first_value = collection[0]
        result = sorted(first_value.keys())

        # Make sure the keys are consistent.
        if isinstance(collection, dict):
            keyed_collection = collection
        else:
            keyed_collection = {index: value for index, value in enumerate(collection)}

        expected = set(result)
        for key, item in keyed_collection.iteritems():
            if set(item.keys()) != expected:
                message = 'Item %s had different keys (got %s, expected %s)' % \
                    (key, ' '.join(item.keys()), ' '.join(expected))
                raise ValueError(message)

        return result

    def format(self, collection):
        if isinstance(collection, dict):
            if self.row_ordering is not None:
                row_ordering = self.row_ordering
            else:
                row_ordering = collection.keys()
            row_heads = [[key] for key in row_ordering]
            rows = [collection[key] for key in row_ordering]
        else:
            row_heads = [[] for _ in range(len(collection))]
            rows = collection

        return [
            row_head +
            [
                row[k]
                for k in self.keys
            ]
            for row_head, row in zip(row_heads, rows)
        ]
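
The `_load` docstring above explains `header_row_transformer`, but the diff has no end-to-end example, so here is a minimal usage sketch of the module-level API. It is not part of this commit: `people.csv` is a hypothetical local path (the README states S3 keys work the same way), and passing `header_row_transformer` through `load()` assumes `ensure_file_open_and_call` forwards extra keyword arguments to `_load`, as the pass-through signature suggests.

```py
from baiji.serialization import csv

# dump() expects a list of lists or tuples; the first row here is the header.
rows = [
    ('Name', 'City'),
    ('Ada', 'London'),
    ('Grace', 'Arlington'),
]
csv.dump(rows, 'people.csv')   # hypothetical local path
csv_text = csv.dumps(rows)     # the same rows rendered to an in-memory string

# load() zips each data row into a dict keyed by the header row.
# header_row_transformer lets the caller rewrite the header first;
# here it lowercases the field names (assumes kwargs are forwarded to _load).
records = csv.load('people.csv',
                   header_row_transformer=lambda names: [n.lower() for n in names])
# records == [{'name': 'Ada', 'city': 'London'},
#             {'name': 'Grace', 'city': 'Arlington'}]
```
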
52 changes: 52 additions & 0 deletions baiji/serialization/test_csv.py
@@ -0,0 +1,52 @@
import unittest

class TestCSV(unittest.TestCase):

    def test_collection_csv_serializer_renders_correct_rows(self):
        from baiji.serialization.csv import CSVCollectionSerializer

        example_data = {
            'one': {'foo': 'baz1', 'bar': 'inga1'},
            'three': {'foo': 'baz3', 'bar': 'inga3'},
            'two': {'foo': 'baz2', 'bar': 'inga2'},
        }
        ordering = ['one', 'two', 'three']

        serializer = CSVCollectionSerializer(
            collection=example_data,
            row_ordering=ordering
        )

        expected = [
            ['', 'bar', 'foo'],
            ['one', 'inga1', 'baz1'],
            ['two', 'inga2', 'baz2'],
            ['three', 'inga3', 'baz3'],
        ]

        self.assertEqual(serializer.render(), expected)

    def test_collection_csv_serializer_works_with_row_ordering_np_array(self):
        import numpy as np
        from baiji.serialization.csv import CSVCollectionSerializer

        example_data = {
            'one': {'foo': 'baz1', 'bar': 'inga1'},
            'three': {'foo': 'baz3', 'bar': 'inga3'},
            'two': {'foo': 'baz2', 'bar': 'inga2'},
        }
        ordering = np.array(['one', 'two', 'three'])

        serializer = CSVCollectionSerializer(
            collection=example_data,
            row_ordering=ordering
        )

        expected = [
            ['', 'bar', 'foo'],
            ['one', 'inga1', 'baz1'],
            ['two', 'inga2', 'baz2'],
            ['three', 'inga3', 'baz3'],
        ]

        self.assertEqual(serializer.render(), expected)
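
Both tests cover `CSVCollectionSerializer` with a dict of dicts. For the base class, the `CSVSerializer` docstring says subclasses may set `header` and supply their own `format`; the sketch below illustrates that pattern. It is not part of this commit, and `PointSerializer` and `points.csv` are invented names used only for illustration.

```py
from baiji.serialization.csv import CSVSerializer

class PointSerializer(CSVSerializer):
    # render() prepends header rows verbatim; one row of column names here.
    header = [('x', 'y')]

    def format(self, points):
        # Turn arbitrary objects (here, (x, y) tuples) into CSV rows.
        return [[x, y] for x, y in points]

serializer = PointSerializer([(0, 0), (3, 4)])
print(serializer.render())     # [('x', 'y'), [0, 0], [3, 4]]
serializer.dump('points.csv')  # delegates to baiji.serialization.csv.dump()
```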
