From 5c6da040cf5647d05c389a47982cf9bbf2b24047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20=28=40turicas=29?= Date: Wed, 24 May 2017 18:45:21 -0700 Subject: [PATCH] Add lazyness metadata to plugins --- rows/plugins/dicts.py | 3 +++ rows/plugins/ods.py | 5 +++++ rows/plugins/plugin_csv.py | 3 +++ rows/plugins/plugin_html.py | 4 ++++ rows/plugins/plugin_json.py | 6 ++++++ rows/plugins/plugin_parquet.py | 6 +++++- rows/plugins/sqlite.py | 3 +++ rows/plugins/txt.py | 7 +++++++ rows/plugins/utils.py | 7 +++++++ rows/plugins/xls.py | 3 +++ rows/plugins/xlsx.py | 4 ++++ rows/plugins/xpath.py | 4 ++++ tests/tests_plugin_csv.py | 1 + tests/tests_plugin_dicts.py | 1 + tests/tests_plugin_html.py | 5 +++-- tests/tests_plugin_json.py | 1 + tests/tests_plugin_ods.py | 1 + tests/tests_plugin_parquet.py | 1 + tests/tests_plugin_sqlite.py | 1 + tests/tests_plugin_txt.py | 1 + tests/tests_plugin_xls.py | 1 + tests/tests_plugin_xlsx.py | 1 + tests/tests_plugin_xpath.py | 4 ++-- 23 files changed, 68 insertions(+), 5 deletions(-) diff --git a/rows/plugins/dicts.py b/rows/plugins/dicts.py index 9dc676d9..36e5fd7e 100644 --- a/rows/plugins/dicts.py +++ b/rows/plugins/dicts.py @@ -51,6 +51,9 @@ def import_from_dicts(data, samples=1000, *args, **kwargs): *args, **kwargs) +import_from_dicts.is_lazy = True + + def export_to_dicts(table, *args, **kwargs): return [{key: getattr(row, key) for key in table.field_names} for row in table] diff --git a/rows/plugins/ods.py b/rows/plugins/ods.py index 76e37c32..cc291ae5 100644 --- a/rows/plugins/ods.py +++ b/rows/plugins/ods.py @@ -103,5 +103,10 @@ def import_from_ods(filename_or_fobj, index=0, *args, **kwargs): max_length = max(len(row) for row in table_rows) full_rows = complete_with_None(table_rows, max_length) + meta = {'imported_from': 'ods', 'filename': filename,} + return create_table(full_rows, meta=meta, *args, **kwargs) + + +import_from_ods.is_lazy = False diff --git a/rows/plugins/plugin_csv.py b/rows/plugins/plugin_csv.py index 9bd18f72..0edf9176 100644 --- a/rows/plugins/plugin_csv.py +++ b/rows/plugins/plugin_csv.py @@ -119,6 +119,9 @@ def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None, return create_table(reader, meta=meta, *args, **kwargs) +import_from_csv.is_lazy = True + + def export_to_csv(table, filename_or_fobj=None, encoding='utf-8', dialect=unicodecsv.excel, batch_size=100, *args, **kwargs): """Export a `rows.Table` to a CSV file diff --git a/rows/plugins/plugin_html.py b/rows/plugins/plugin_html.py index c57b407f..ae874cdd 100644 --- a/rows/plugins/plugin_html.py +++ b/rows/plugins/plugin_html.py @@ -98,6 +98,9 @@ def import_from_html(filename_or_fobj, encoding='utf-8', index=0, return create_table(table_rows, meta=meta, *args, **kwargs) +import_from_html.is_lazy = False + + def export_to_html(table, filename_or_fobj=None, encoding='utf-8', *args, **kwargs): serialized_table = serialize(table, *args, **kwargs) @@ -106,6 +109,7 @@ def export_to_html(table, filename_or_fobj=None, encoding='utf-8', *args, header = [' {} \n'.format(field) for field in fields] result.extend(header) result.extend([' \n', ' \n', '\n', ' \n', '\n']) + # TODO: could be lazy so we don't need to store the whole table into memory for index, row in enumerate(serialized_table, start=1): css_class = 'odd' if index % 2 == 1 else 'even' result.append(' \n'.format(css_class)) diff --git a/rows/plugins/plugin_json.py b/rows/plugins/plugin_json.py index 795b990e..e1947144 100644 --- a/rows/plugins/plugin_json.py +++ b/rows/plugins/plugin_json.py @@ -38,6 +38,7 @@ def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs): filename, fobj = get_filename_and_fobj(filename_or_fobj) json_obj = json.load(fobj, encoding=encoding) + # TODO: may use import_from_dicts here field_names = list(json_obj[0].keys()) table_rows = [[item[key] for key in field_names] for item in json_obj] @@ -47,6 +48,9 @@ def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs): return create_table([field_names] + table_rows, meta=meta, *args, **kwargs) +import_from_json.is_lazy = False + + def _convert(value, field_type, *args, **kwargs): if value is None or field_type in ( fields.BinaryField, @@ -77,6 +81,8 @@ def export_to_json(table, filename_or_fobj=None, encoding='utf-8', indent=None, fields = table.fields prepared_table = prepare_to_export(table, *args, **kwargs) field_names = next(prepared_table) + + # TODO: could be lazy so we don't need to store the whole table into memory data = [{field_name: _convert(value, fields[field_name], *args, **kwargs) for field_name, value in zip(field_names, row)} for row in prepared_table] diff --git a/rows/plugins/plugin_parquet.py b/rows/plugins/plugin_parquet.py index 2ceedcf3..6850bc45 100644 --- a/rows/plugins/plugin_parquet.py +++ b/rows/plugins/plugin_parquet.py @@ -56,8 +56,12 @@ def import_from_parquet(filename_or_fobj, *args, **kwargs): for schema in parquet._read_footer(fobj).schema if schema.type is not None]) header = list(types.keys()) - table_rows = list(parquet.reader(fobj)) # TODO: be lazy + # TODO: make it lazy + table_rows = list(parquet.reader(fobj)) meta = {'imported_from': 'parquet', 'filename': filename,} return create_table([header] + table_rows, meta=meta, force_types=types, *args, **kwargs) + + +import_from_parquet.is_lazy = False diff --git a/rows/plugins/sqlite.py b/rows/plugins/sqlite.py index a5617efa..3a36ce9f 100644 --- a/rows/plugins/sqlite.py +++ b/rows/plugins/sqlite.py @@ -128,6 +128,9 @@ def import_from_sqlite(filename_or_connection, table_name='table1', query=None, return create_table(data, meta=meta, *args, **kwargs) +import_from_sqlite.is_lazy = True + + def export_to_sqlite(table, filename_or_connection, table_name=None, table_name_format='table{index}', batch_size=100, *args, **kwargs): diff --git a/rows/plugins/txt.py b/rows/plugins/txt.py index 3f2fb658..b3f09db3 100644 --- a/rows/plugins/txt.py +++ b/rows/plugins/txt.py @@ -34,18 +34,24 @@ def import_from_txt(filename_or_fobj, encoding='utf-8', *args, **kwargs): filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb') contents = fobj.read().decode(encoding).strip().splitlines() + # TODO: make it lazy + # remove '+----+----+' lines contents = contents[1:-1] del contents[1] table_rows = [[value.strip() for value in row.split(PIPE)[1:-1]] for row in contents] + meta = {'imported_from': 'txt', 'filename': filename, 'encoding': encoding,} return create_table(table_rows, meta=meta, *args, **kwargs) +import_from_txt.is_lazy = False + + def export_to_txt(table, filename_or_fobj=None, encoding=None, *args, **kwargs): '''Export a `rows.Table` to text @@ -70,6 +76,7 @@ def export_to_txt(table, filename_or_fobj=None, encoding=None, split_line = PLUS + PLUS.join(dashes) + PLUS result = [split_line, header, split_line] + # TODO: make it lazy for row in table_rows: values = [value.rjust(max_sizes[field_name]) for field_name, value in zip(field_names, row)] diff --git a/rows/plugins/utils.py b/rows/plugins/utils.py index 10a59981..725468ed 100644 --- a/rows/plugins/utils.py +++ b/rows/plugins/utils.py @@ -141,7 +141,10 @@ def func(rows_data): def create_table(data, meta=None, fields=None, skip_header=True, import_fields=None, samples=None, force_types=None, lazy=False, *args, **kwargs): + # TODO: change samples to be a fixed number + # TODO: may change samples logic (`float('inf')` or `all`) # TODO: add auto_detect_types=True parameter + table_rows = iter(data) sample_rows = [] @@ -163,6 +166,9 @@ def create_table(data, meta=None, fields=None, skip_header=True, if not isinstance(fields, OrderedDict): raise ValueError('`fields` must be an `OrderedDict`') + # TODO: if `fields` is set, we're going to have the wrong order, + # compared to the first row (header). + if skip_header: _ = next(table_rows) @@ -187,6 +193,7 @@ def create_table(data, meta=None, fields=None, skip_header=True, if not lazy: table = Table(fields=fields, meta=meta) + # TODO: put this inside Table.__init__ for row in chain(sample_rows, table_rows): table.append({field_name: value diff --git a/rows/plugins/xls.py b/rows/plugins/xls.py index 7de2a1da..4398cea8 100644 --- a/rows/plugins/xls.py +++ b/rows/plugins/xls.py @@ -156,6 +156,9 @@ def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0, return create_table(table_rows, meta=meta, *args, **kwargs) +import_from_xls.is_lazy = False + + def export_to_xls(table, filename_or_fobj=None, sheet_name='Sheet1', *args, **kwargs): diff --git a/rows/plugins/xlsx.py b/rows/plugins/xlsx.py index 483ab64f..83f3bf56 100644 --- a/rows/plugins/xlsx.py +++ b/rows/plugins/xlsx.py @@ -77,12 +77,16 @@ def import_from_xlsx(filename_or_fobj, sheet_name=None, sheet_index=0, for row_index in range(start_row, end_row + 1)] filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True) + metadata = {'imported_from': 'xlsx', 'filename': filename, 'sheet_name': sheet_name, } return create_table(table_rows, meta=metadata, *args, **kwargs) +import_from_xlsx.is_lazy = False + + FORMATTING_STYLES = { fields.DateField: 'YYYY-MM-DD', fields.DatetimeField: 'YYYY-MM-DD HH:MM:SS', diff --git a/rows/plugins/xpath.py b/rows/plugins/xpath.py index 2fba3b3b..5b04f258 100644 --- a/rows/plugins/xpath.py +++ b/rows/plugins/xpath.py @@ -69,6 +69,7 @@ def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath, filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb') xml = fobj.read().decode(encoding) + # TODO: make it lazy (is it possible with lxml?) tree = tree_from_string(xml) row_elements = tree.xpath(rows_xpath) @@ -80,3 +81,6 @@ def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath, 'filename': filename, 'encoding': encoding,} return create_table([header] + result_rows, meta=meta, *args, **kwargs) + + +import_from_xpath.is_lazy = False diff --git a/tests/tests_plugin_csv.py b/tests/tests_plugin_csv.py index 100ce52a..f4d73008 100644 --- a/tests/tests_plugin_csv.py +++ b/tests/tests_plugin_csv.py @@ -54,6 +54,7 @@ class PluginCsvTestCase(utils.RowsTestMixIn, unittest.TestCase): def test_imports(self): self.assertIs(rows.import_from_csv, rows.plugins.plugin_csv.import_from_csv) self.assertIs(rows.export_to_csv, rows.plugins.plugin_csv.export_to_csv) + self.assertTrue(rows.import_from_csv.is_lazy) @mock.patch('rows.plugins.plugin_csv.create_table') def test_import_from_csv_uses_create_table(self, mocked_create_table): diff --git a/tests/tests_plugin_dicts.py b/tests/tests_plugin_dicts.py index 3302dc46..49e39dfa 100644 --- a/tests/tests_plugin_dicts.py +++ b/tests/tests_plugin_dicts.py @@ -46,6 +46,7 @@ def test_imports(self): self.assertIs(rows.import_from_dicts, rows.plugins.dicts.import_from_dicts) self.assertIs(rows.export_to_dicts, rows.plugins.dicts.export_to_dicts) + self.assertTrue(rows.import_from_dicts.is_lazy) @mock.patch('rows.plugins.dicts.create_table') def test_import_from_dicts_uses_create_table(self, mocked_create_table): diff --git a/tests/tests_plugin_html.py b/tests/tests_plugin_html.py index a55da85a..d41a0754 100644 --- a/tests/tests_plugin_html.py +++ b/tests/tests_plugin_html.py @@ -1,6 +1,6 @@ # coding: utf-8 -# Copyright 2014-2016 Álvaro Justen +# Copyright 2014-2017 Álvaro Justen # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -54,6 +54,7 @@ def test_imports(self): self.assertIs(rows.import_from_html, rows.plugins.plugin_html.import_from_html) self.assertIs(rows.export_to_html, rows.plugins.plugin_html.export_to_html) + self.assertFalse(rows.import_from_html.is_lazy) def test_import_from_html_filename(self): table = rows.import_from_html(self.filename, encoding=self.encoding) @@ -89,7 +90,7 @@ def test_import_from_html_uses_create_table(self, mocked_create_table): call = mocked_create_table.call_args kwargs['meta'] = {'imported_from': 'html', 'filename': self.filename, - 'encoding': 'iso-8859-1',} + 'encoding': 'iso-8859-1', } self.assertEqual(call[1], kwargs) def test_export_to_html_filename(self): diff --git a/tests/tests_plugin_json.py b/tests/tests_plugin_json.py index 57adc8a7..7795b064 100644 --- a/tests/tests_plugin_json.py +++ b/tests/tests_plugin_json.py @@ -47,6 +47,7 @@ def test_imports(self): rows.plugins.plugin_json.import_from_json) self.assertIs(rows.export_to_json, rows.plugins.plugin_json.export_to_json) + self.assertFalse(rows.import_from_json.is_lazy) @mock.patch('rows.plugins.plugin_json.create_table') def test_import_from_json_uses_create_table(self, mocked_create_table): diff --git a/tests/tests_plugin_ods.py b/tests/tests_plugin_ods.py index c639184f..43c824c2 100644 --- a/tests/tests_plugin_ods.py +++ b/tests/tests_plugin_ods.py @@ -35,6 +35,7 @@ class PluginOdsTestCase(utils.RowsTestMixIn, unittest.TestCase): def test_imports(self): self.assertIs(rows.import_from_ods, rows.plugins.ods.import_from_ods) + self.assertFalse(rows.import_from_ods.is_lazy) @mock.patch('rows.plugins.ods.create_table') def test_import_from_ods_uses_create_table(self, mocked_create_table): diff --git a/tests/tests_plugin_parquet.py b/tests/tests_plugin_parquet.py index 0fbc7769..b73775d3 100644 --- a/tests/tests_plugin_parquet.py +++ b/tests/tests_plugin_parquet.py @@ -63,6 +63,7 @@ class PluginParquetTestCase(unittest.TestCase): def test_imports(self): self.assertIs(rows.import_from_parquet, rows.plugins.plugin_parquet.import_from_parquet) + self.assertFalse(rows.import_from_parquet.is_lazy) @mock.patch('rows.plugins.plugin_parquet.create_table') def test_import_from_parquet_uses_create_table(self, mocked_create_table): diff --git a/tests/tests_plugin_sqlite.py b/tests/tests_plugin_sqlite.py index 40a78425..771f11c9 100644 --- a/tests/tests_plugin_sqlite.py +++ b/tests/tests_plugin_sqlite.py @@ -50,6 +50,7 @@ def test_imports(self): rows.plugins.sqlite.import_from_sqlite) self.assertIs(rows.export_to_sqlite, rows.plugins.sqlite.export_to_sqlite) + self.assertTrue(rows.import_from_sqlite.is_lazy) @mock.patch('rows.plugins.sqlite.create_table') def test_import_from_sqlite_uses_create_table(self, mocked_create_table): diff --git a/tests/tests_plugin_txt.py b/tests/tests_plugin_txt.py index f04bc2f1..d530cfdb 100644 --- a/tests/tests_plugin_txt.py +++ b/tests/tests_plugin_txt.py @@ -41,6 +41,7 @@ class PluginTxtTestCase(utils.RowsTestMixIn, unittest.TestCase): def test_imports(self): self.assertIs(rows.import_from_txt, rows.plugins.txt.import_from_txt) self.assertIs(rows.export_to_txt, rows.plugins.txt.export_to_txt) + self.assertFalse(rows.import_from_txt.is_lazy) @mock.patch('rows.plugins.txt.create_table') def test_import_from_txt_uses_create_table(self, mocked_create_table): diff --git a/tests/tests_plugin_xls.py b/tests/tests_plugin_xls.py index 9ca55a16..9a3001f0 100644 --- a/tests/tests_plugin_xls.py +++ b/tests/tests_plugin_xls.py @@ -46,6 +46,7 @@ class PluginXlsTestCase(utils.RowsTestMixIn, unittest.TestCase): def test_imports(self): self.assertIs(rows.import_from_xls, rows.plugins.xls.import_from_xls) self.assertIs(rows.export_to_xls, rows.plugins.xls.export_to_xls) + self.assertFalse(rows.import_from_xls.is_lazy) @mock.patch('rows.plugins.xls.create_table') def test_import_from_xls_uses_create_table(self, mocked_create_table): diff --git a/tests/tests_plugin_xlsx.py b/tests/tests_plugin_xlsx.py index 820694f8..bbd869f8 100644 --- a/tests/tests_plugin_xlsx.py +++ b/tests/tests_plugin_xlsx.py @@ -42,6 +42,7 @@ def test_imports(self): rows.plugins.xlsx.import_from_xlsx) self.assertIs(rows.export_to_xlsx, rows.plugins.xlsx.export_to_xlsx) + self.assertFalse(rows.import_from_xlsx.is_lazy) @mock.patch('rows.plugins.xlsx.create_table') def test_import_from_xlsx_uses_create_table(self, mocked_create_table): diff --git a/tests/tests_plugin_xpath.py b/tests/tests_plugin_xpath.py index aacd97d5..e9b08af9 100644 --- a/tests/tests_plugin_xpath.py +++ b/tests/tests_plugin_xpath.py @@ -107,9 +107,9 @@ def test_import_from_xpath_unescape_and_extract_text(self): fields_xpath = OrderedDict([('name', './/text()'), ('link', './/a/@href')]) table = rows.import_from_xpath(BytesIO(html), + encoding='utf-8', rows_xpath=rows_xpath, - fields_xpath=fields_xpath, - encoding='utf-8') + fields_xpath=fields_xpath) self.assertEqual(table[0].name, 'Abadia de Goiás (GO)') self.assertEqual(table[1].name, 'Abadiânia (GO)')