Add laziness metadata to plugins

turicas · Sep 11, 2017 · 5c6da04
1 parent c22f6f7
commit 5c6da04
Show file tree

Hide file tree

Showing 23 changed files with 68 additions and 5 deletions.
diff --git a/rows/plugins/dicts.py b/rows/plugins/dicts.py
@@ -51,6 +51,9 @@ def import_from_dicts(data, samples=1000, *args, **kwargs):
             *args, **kwargs)
 
 
+import_from_dicts.is_lazy = True
+
+
 def export_to_dicts(table, *args, **kwargs):
     return [{key: getattr(row, key) for key in table.field_names}
             for row in table]
diff --git a/rows/plugins/ods.py b/rows/plugins/ods.py
@@ -103,5 +103,10 @@ def import_from_ods(filename_or_fobj, index=0, *args, **kwargs):
 
     max_length = max(len(row) for row in table_rows)
     full_rows = complete_with_None(table_rows, max_length)
+
     meta = {'imported_from': 'ods', 'filename': filename,}
+
     return create_table(full_rows, meta=meta, *args, **kwargs)
+
+
+import_from_ods.is_lazy = False
diff --git a/rows/plugins/plugin_csv.py b/rows/plugins/plugin_csv.py
@@ -119,6 +119,9 @@ def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None,
     return create_table(reader, meta=meta, *args, **kwargs)
 
 
+import_from_csv.is_lazy = True
+
+
 def export_to_csv(table, filename_or_fobj=None, encoding='utf-8',
                   dialect=unicodecsv.excel, batch_size=100, *args, **kwargs):
     """Export a `rows.Table` to a CSV file

diff --git a/rows/plugins/plugin_html.py b/rows/plugins/plugin_html.py
@@ -98,6 +98,9 @@ def import_from_html(filename_or_fobj, encoding='utf-8', index=0,
     return create_table(table_rows, meta=meta, *args, **kwargs)
 
 
+import_from_html.is_lazy = False
+
+
 def export_to_html(table, filename_or_fobj=None, encoding='utf-8', *args,
                    **kwargs):
     serialized_table = serialize(table, *args, **kwargs)
@@ -106,6 +109,7 @@ def export_to_html(table, filename_or_fobj=None, encoding='utf-8', *args,
     header = ['      <th> {} </th>\n'.format(field) for field in fields]
     result.extend(header)
     result.extend(['    </tr>\n', '  </thead>\n', '\n', '  <tbody>\n', '\n'])
+    # TODO: could be lazy so we don't need to store the whole table into memory
     for index, row in enumerate(serialized_table, start=1):
         css_class = 'odd' if index % 2 == 1 else 'even'
         result.append('    <tr class="{}">\n'.format(css_class))

diff --git a/rows/plugins/plugin_json.py b/rows/plugins/plugin_json.py
@@ -38,6 +38,7 @@ def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
     filename, fobj = get_filename_and_fobj(filename_or_fobj)
 
     json_obj = json.load(fobj, encoding=encoding)
+    # TODO: may use import_from_dicts here
     field_names = list(json_obj[0].keys())
     table_rows = [[item[key] for key in field_names] for item in json_obj]
 
@@ -47,6 +48,9 @@ def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
     return create_table([field_names] + table_rows, meta=meta, *args, **kwargs)
 
 
+import_from_json.is_lazy = False
+
+
 def _convert(value, field_type, *args, **kwargs):
     if value is None or field_type in (
                 fields.BinaryField,
@@ -77,6 +81,8 @@ def export_to_json(table, filename_or_fobj=None, encoding='utf-8', indent=None,
     fields = table.fields
     prepared_table = prepare_to_export(table, *args, **kwargs)
     field_names = next(prepared_table)
+
+    # TODO: could be lazy so we don't need to store the whole table into memory
     data = [{field_name: _convert(value, fields[field_name], *args, **kwargs)
              for field_name, value in zip(field_names, row)}
             for row in prepared_table]

diff --git a/rows/plugins/plugin_parquet.py b/rows/plugins/plugin_parquet.py
@@ -56,8 +56,12 @@ def import_from_parquet(filename_or_fobj, *args, **kwargs):
                          for schema in parquet._read_footer(fobj).schema
                          if schema.type is not None])
     header = list(types.keys())
-    table_rows = list(parquet.reader(fobj))  # TODO: be lazy
+    # TODO: make it lazy
+    table_rows = list(parquet.reader(fobj))
 
     meta = {'imported_from': 'parquet', 'filename': filename,}
     return create_table([header] + table_rows, meta=meta, force_types=types,
                         *args, **kwargs)
+
+
+import_from_parquet.is_lazy = False
diff --git a/rows/plugins/sqlite.py b/rows/plugins/sqlite.py
@@ -128,6 +128,9 @@ def import_from_sqlite(filename_or_connection, table_name='table1', query=None,
     return create_table(data, meta=meta, *args, **kwargs)
 
 
+import_from_sqlite.is_lazy = True
+
+
 def export_to_sqlite(table, filename_or_connection, table_name=None,
                      table_name_format='table{index}', batch_size=100,
                      *args, **kwargs):

diff --git a/rows/plugins/txt.py b/rows/plugins/txt.py
@@ -34,18 +34,24 @@ def import_from_txt(filename_or_fobj, encoding='utf-8', *args, **kwargs):
     filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
     contents = fobj.read().decode(encoding).strip().splitlines()
 
+    # TODO: make it lazy
+
     # remove '+----+----+' lines
     contents = contents[1:-1]
     del contents[1]
 
     table_rows = [[value.strip() for value in row.split(PIPE)[1:-1]]
                   for row in contents]
+
     meta = {'imported_from': 'txt',
             'filename': filename,
             'encoding': encoding,}
     return create_table(table_rows, meta=meta, *args, **kwargs)
 
 
+import_from_txt.is_lazy = False
+
+
 def export_to_txt(table, filename_or_fobj=None, encoding=None,
                   *args, **kwargs):
     '''Export a `rows.Table` to text
@@ -70,6 +76,7 @@ def export_to_txt(table, filename_or_fobj=None, encoding=None,
     split_line = PLUS + PLUS.join(dashes) + PLUS
 
     result = [split_line, header, split_line]
+    # TODO: make it lazy
     for row in table_rows:
         values = [value.rjust(max_sizes[field_name])
                   for field_name, value in zip(field_names, row)]

diff --git a/rows/plugins/utils.py b/rows/plugins/utils.py
@@ -141,7 +141,10 @@ def func(rows_data):
 def create_table(data, meta=None, fields=None, skip_header=True,
                  import_fields=None, samples=None, force_types=None,
                  lazy=False, *args, **kwargs):
+    # TODO: change samples to be a fixed number
+    # TODO: may change samples logic (`float('inf')` or `all`)
     # TODO: add auto_detect_types=True parameter
+
     table_rows = iter(data)
     sample_rows = []
 
@@ -163,6 +166,9 @@ def create_table(data, meta=None, fields=None, skip_header=True,
         if not isinstance(fields, OrderedDict):
             raise ValueError('`fields` must be an `OrderedDict`')
 
+        # TODO: if `fields` is set, we're going to have the wrong order,
+        # compared to the first row (header).
+
         if skip_header:
             _ = next(table_rows)
 
@@ -187,6 +193,7 @@ def create_table(data, meta=None, fields=None, skip_header=True,
 
     if not lazy:
         table = Table(fields=fields, meta=meta)
+
         # TODO: put this inside Table.__init__
         for row in chain(sample_rows, table_rows):
             table.append({field_name: value

diff --git a/rows/plugins/xls.py b/rows/plugins/xls.py
@@ -156,6 +156,9 @@ def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0,
     return create_table(table_rows, meta=meta, *args, **kwargs)
 
 
+import_from_xls.is_lazy = False
+
+
 def export_to_xls(table, filename_or_fobj=None, sheet_name='Sheet1', *args,
                   **kwargs):
 

diff --git a/rows/plugins/xlsx.py b/rows/plugins/xlsx.py
@@ -77,12 +77,16 @@ def import_from_xlsx(filename_or_fobj, sheet_name=None, sheet_index=0,
                   for row_index in range(start_row, end_row + 1)]
 
     filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
+
     metadata = {'imported_from': 'xlsx',
                 'filename': filename,
                 'sheet_name': sheet_name, }
     return create_table(table_rows, meta=metadata, *args, **kwargs)
 
 
+import_from_xlsx.is_lazy = False
+
+
 FORMATTING_STYLES = {
         fields.DateField: 'YYYY-MM-DD',
         fields.DatetimeField: 'YYYY-MM-DD HH:MM:SS',

diff --git a/rows/plugins/xpath.py b/rows/plugins/xpath.py
@@ -69,6 +69,7 @@ def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath,
 
     filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
     xml = fobj.read().decode(encoding)
+    # TODO: make it lazy (is it possible with lxml?)
     tree = tree_from_string(xml)
     row_elements = tree.xpath(rows_xpath)
 
@@ -80,3 +81,6 @@ def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath,
             'filename': filename,
             'encoding': encoding,}
     return create_table([header] + result_rows, meta=meta, *args, **kwargs)
+
+
+import_from_xpath.is_lazy = False
diff --git a/tests/tests_plugin_csv.py b/tests/tests_plugin_csv.py
@@ -54,6 +54,7 @@ class PluginCsvTestCase(utils.RowsTestMixIn, unittest.TestCase):
     def test_imports(self):
         self.assertIs(rows.import_from_csv, rows.plugins.plugin_csv.import_from_csv)
         self.assertIs(rows.export_to_csv, rows.plugins.plugin_csv.export_to_csv)
+        self.assertTrue(rows.import_from_csv.is_lazy)
 
     @mock.patch('rows.plugins.plugin_csv.create_table')
     def test_import_from_csv_uses_create_table(self, mocked_create_table):

diff --git a/tests/tests_plugin_dicts.py b/tests/tests_plugin_dicts.py
@@ -46,6 +46,7 @@ def test_imports(self):
         self.assertIs(rows.import_from_dicts,
                       rows.plugins.dicts.import_from_dicts)
         self.assertIs(rows.export_to_dicts, rows.plugins.dicts.export_to_dicts)
+        self.assertTrue(rows.import_from_dicts.is_lazy)
 
     @mock.patch('rows.plugins.dicts.create_table')
     def test_import_from_dicts_uses_create_table(self, mocked_create_table):

diff --git a/tests/tests_plugin_html.py b/tests/tests_plugin_html.py
@@ -1,6 +1,6 @@
 # coding: utf-8
 
-# Copyright 2014-2016 Álvaro Justen <https://github.com/turicas/rows/>
+# Copyright 2014-2017 Álvaro Justen <https://github.com/turicas/rows/>
 #
 #    This program is free software: you can redistribute it and/or modify
 #    it under the terms of the GNU General Public License as published by
@@ -54,6 +54,7 @@ def test_imports(self):
         self.assertIs(rows.import_from_html,
                       rows.plugins.plugin_html.import_from_html)
         self.assertIs(rows.export_to_html, rows.plugins.plugin_html.export_to_html)
+        self.assertFalse(rows.import_from_html.is_lazy)
 
     def test_import_from_html_filename(self):
         table = rows.import_from_html(self.filename, encoding=self.encoding)
@@ -89,7 +90,7 @@ def test_import_from_html_uses_create_table(self, mocked_create_table):
         call = mocked_create_table.call_args
         kwargs['meta'] = {'imported_from': 'html',
                           'filename': self.filename,
-                          'encoding': 'iso-8859-1',}
+                          'encoding': 'iso-8859-1', }
         self.assertEqual(call[1], kwargs)
 
     def test_export_to_html_filename(self):

diff --git a/tests/tests_plugin_json.py b/tests/tests_plugin_json.py
@@ -47,6 +47,7 @@ def test_imports(self):
                       rows.plugins.plugin_json.import_from_json)
         self.assertIs(rows.export_to_json,
                       rows.plugins.plugin_json.export_to_json)
+        self.assertFalse(rows.import_from_json.is_lazy)
 
     @mock.patch('rows.plugins.plugin_json.create_table')
     def test_import_from_json_uses_create_table(self, mocked_create_table):

diff --git a/tests/tests_plugin_ods.py b/tests/tests_plugin_ods.py
@@ -35,6 +35,7 @@ class PluginOdsTestCase(utils.RowsTestMixIn, unittest.TestCase):
 
     def test_imports(self):
         self.assertIs(rows.import_from_ods, rows.plugins.ods.import_from_ods)
+        self.assertFalse(rows.import_from_ods.is_lazy)
 
     @mock.patch('rows.plugins.ods.create_table')
     def test_import_from_ods_uses_create_table(self, mocked_create_table):

diff --git a/tests/tests_plugin_parquet.py b/tests/tests_plugin_parquet.py
@@ -63,6 +63,7 @@ class PluginParquetTestCase(unittest.TestCase):
     def test_imports(self):
         self.assertIs(rows.import_from_parquet,
                       rows.plugins.plugin_parquet.import_from_parquet)
+        self.assertFalse(rows.import_from_parquet.is_lazy)
 
     @mock.patch('rows.plugins.plugin_parquet.create_table')
     def test_import_from_parquet_uses_create_table(self, mocked_create_table):

diff --git a/tests/tests_plugin_sqlite.py b/tests/tests_plugin_sqlite.py
@@ -50,6 +50,7 @@ def test_imports(self):
                       rows.plugins.sqlite.import_from_sqlite)
         self.assertIs(rows.export_to_sqlite,
                       rows.plugins.sqlite.export_to_sqlite)
+        self.assertTrue(rows.import_from_sqlite.is_lazy)
 
     @mock.patch('rows.plugins.sqlite.create_table')
     def test_import_from_sqlite_uses_create_table(self, mocked_create_table):

diff --git a/tests/tests_plugin_txt.py b/tests/tests_plugin_txt.py
@@ -41,6 +41,7 @@ class PluginTxtTestCase(utils.RowsTestMixIn, unittest.TestCase):
     def test_imports(self):
         self.assertIs(rows.import_from_txt, rows.plugins.txt.import_from_txt)
         self.assertIs(rows.export_to_txt, rows.plugins.txt.export_to_txt)
+        self.assertFalse(rows.import_from_txt.is_lazy)
 
     @mock.patch('rows.plugins.txt.create_table')
     def test_import_from_txt_uses_create_table(self, mocked_create_table):

diff --git a/tests/tests_plugin_xls.py b/tests/tests_plugin_xls.py
@@ -46,6 +46,7 @@ class PluginXlsTestCase(utils.RowsTestMixIn, unittest.TestCase):
     def test_imports(self):
         self.assertIs(rows.import_from_xls, rows.plugins.xls.import_from_xls)
         self.assertIs(rows.export_to_xls, rows.plugins.xls.export_to_xls)
+        self.assertFalse(rows.import_from_xls.is_lazy)
 
     @mock.patch('rows.plugins.xls.create_table')
     def test_import_from_xls_uses_create_table(self, mocked_create_table):

diff --git a/tests/tests_plugin_xlsx.py b/tests/tests_plugin_xlsx.py
@@ -42,6 +42,7 @@ def test_imports(self):
                       rows.plugins.xlsx.import_from_xlsx)
         self.assertIs(rows.export_to_xlsx,
                       rows.plugins.xlsx.export_to_xlsx)
+        self.assertFalse(rows.import_from_xlsx.is_lazy)
 
     @mock.patch('rows.plugins.xlsx.create_table')
     def test_import_from_xlsx_uses_create_table(self, mocked_create_table):

diff --git a/tests/tests_plugin_xpath.py b/tests/tests_plugin_xpath.py
@@ -107,9 +107,9 @@ def test_import_from_xpath_unescape_and_extract_text(self):
         fields_xpath = OrderedDict([('name', './/text()'),
                                     ('link', './/a/@href')])
         table = rows.import_from_xpath(BytesIO(html),
+                                       encoding='utf-8',
                                        rows_xpath=rows_xpath,
-                                       fields_xpath=fields_xpath,
-                                       encoding='utf-8')
+                                       fields_xpath=fields_xpath)
         self.assertEqual(table[0].name, 'Abadia de Goiás (GO)')
         self.assertEqual(table[1].name, 'Abadiânia (GO)')