From 7b7da4c0f54411b782b9f1cb997ffb6a9ab9f91a Mon Sep 17 00:00:00 2001 From: Zhihan Yue Date: Sun, 4 Feb 2024 20:40:45 +0800 Subject: [PATCH] Requiring pandas>=0.20.0 --- qgrid/grid.py | 27 +- qgrid/pd_json/__init__.py | 5 - qgrid/pd_json/json.py | 709 ---------------------------------- qgrid/pd_json/normalize.py | 265 ------------- qgrid/pd_json/table_schema.py | 178 --------- requirements.txt | 2 +- setup.py | 2 +- 7 files changed, 10 insertions(+), 1178 deletions(-) delete mode 100644 qgrid/pd_json/__init__.py delete mode 100644 qgrid/pd_json/json.py delete mode 100644 qgrid/pd_json/normalize.py delete mode 100644 qgrid/pd_json/table_schema.py diff --git a/qgrid/grid.py b/qgrid/grid.py index 0422995..6f72088 100644 --- a/qgrid/grid.py +++ b/qgrid/grid.py @@ -22,17 +22,6 @@ from uuid import uuid4 from six import string_types -# versions of pandas prior to version 0.20.0 don't support the orient='table' -# when calling the 'to_json' function on DataFrames. to get around this we -# have our own copy of the panda's 0.20.0 implementation that we use for old -# versions of pandas. -from distutils.version import LooseVersion -if LooseVersion(pd.__version__) > LooseVersion('0.20.0'): - import pandas.io.json as pd_json -else: - from . import pd_json - - class _DefaultSettings(object): def __init__(self): @@ -954,10 +943,10 @@ def should_be_stringified(col_series): else: self._row_styles = {} - df_json = pd_json.to_json(None, df, - orient='table', - date_format='iso', - double_precision=self.precision) + df_json = df.to_json(None, + orient='table', + date_format='iso', + double_precision=self.precision) if update_columns: self._interval_columns = [] @@ -1034,10 +1023,10 @@ def should_be_stringified(col_series): # and then call 'to_json' again to get a new version of the table # json that has interval columns replaced with text columns if len(self._interval_columns) > 0 or len(self._period_columns) > 0: - df_json = pd_json.to_json(None, df, - orient='table', - date_format='iso', - double_precision=self.precision) + df_json = df.to_json(None, + orient='table', + date_format='iso', + double_precision=self.precision) self._df_json = df_json diff --git a/qgrid/pd_json/__init__.py b/qgrid/pd_json/__init__.py deleted file mode 100644 index 32d110b..0000000 --- a/qgrid/pd_json/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .json import to_json, read_json, loads, dumps # noqa -from .normalize import json_normalize # noqa -from .table_schema import build_table_schema # noqa - -del json, normalize, table_schema # noqa diff --git a/qgrid/pd_json/json.py b/qgrid/pd_json/json.py deleted file mode 100644 index 8fb184a..0000000 --- a/qgrid/pd_json/json.py +++ /dev/null @@ -1,709 +0,0 @@ -# pylint: disable-msg=E1101,W0613,W0603 -import os -import numpy as np - -import pandas.json as json -from pandas.tslib import iNaT -from pandas.compat import StringIO, long, u -from pandas import compat, isnull -from pandas import Series, DataFrame, to_datetime, MultiIndex -from pandas.io.common import (get_filepath_or_buffer, _get_handle, - _stringify_path) -from pandas.core.common import AbstractMethodError -from pandas.formats.printing import pprint_thing -from .normalize import _convert_to_line_delimits -from .table_schema import build_table_schema - -loads = json.loads -dumps = json.dumps - -TABLE_SCHEMA_VERSION = '0.20.0' - - -# interface to/from -def to_json(path_or_buf, obj, orient=None, date_format='epoch', - double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None, lines=False): - - path_or_buf = _stringify_path(path_or_buf) - if lines and orient != 'records': - raise ValueError( - "'lines' keyword only valid when 'orient' is records") - - if orient == 'table' and isinstance(obj, Series): - obj = obj.to_frame(name=obj.name or 'values') - if orient == 'table' and isinstance(obj, DataFrame): - writer = JSONTableWriter - elif isinstance(obj, Series): - writer = SeriesWriter - elif isinstance(obj, DataFrame): - writer = FrameWriter - else: - raise NotImplementedError("'obj' should be a Series or a DataFrame") - - s = writer( - obj, orient=orient, date_format=date_format, - double_precision=double_precision, ensure_ascii=force_ascii, - date_unit=date_unit, default_handler=default_handler).write() - - if lines: - s = _convert_to_line_delimits(s) - - if isinstance(path_or_buf, compat.string_types): - with open(path_or_buf, 'w') as fh: - fh.write(s) - elif path_or_buf is None: - return s - else: - path_or_buf.write(s) - - -class Writer(object): - - def __init__(self, obj, orient, date_format, double_precision, - ensure_ascii, date_unit, default_handler=None): - self.obj = obj - - if orient is None: - orient = self._default_orient - - self.orient = orient - self.date_format = date_format - self.double_precision = double_precision - self.ensure_ascii = ensure_ascii - self.date_unit = date_unit - self.default_handler = default_handler - - self.is_copy = None - self._format_axes() - - def _format_axes(self): - raise AbstractMethodError(self) - - def write(self): - return dumps( - self.obj, - orient=self.orient, - double_precision=self.double_precision, - ensure_ascii=self.ensure_ascii, - date_unit=self.date_unit, - iso_dates=self.date_format == 'iso', - default_handler=self.default_handler - ) - - -class SeriesWriter(Writer): - _default_orient = 'index' - - def _format_axes(self): - if not self.obj.index.is_unique and self.orient == 'index': - raise ValueError("Series index must be unique for orient=" - "'%s'" % self.orient) - - -class FrameWriter(Writer): - _default_orient = 'columns' - - def _format_axes(self): - """ try to axes if they are datelike """ - if not self.obj.index.is_unique and self.orient in ( - 'index', 'columns'): - raise ValueError("DataFrame index must be unique for orient=" - "'%s'." % self.orient) - if not self.obj.columns.is_unique and self.orient in ( - 'index', 'columns', 'records'): - raise ValueError("DataFrame columns must be unique for orient=" - "'%s'." % self.orient) - - -class JSONTableWriter(FrameWriter): - _default_orient = 'records' - - def __init__(self, obj, orient, date_format, double_precision, - ensure_ascii, date_unit, default_handler=None): - """ - Adds a `schema` attribut with the Table Schema, resets - the index (can't do in caller, because the schema inference needs - to know what the index is, forces orient to records, and forces - date_format to 'iso'. - """ - super(JSONTableWriter, self).__init__( - obj, orient, date_format, double_precision, ensure_ascii, - date_unit, default_handler=default_handler) - - if date_format != 'iso': - msg = ("Trying to write with `orient='table'` and " - "`date_format='%s'`. Table Schema requires dates " - "to be formatted with `date_format='iso'`" % date_format) - raise ValueError(msg) - - self.schema = build_table_schema(obj) - - # NotImplementd on a column MultiIndex - if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): - raise NotImplementedError( - "orient='table' is not supported for MultiIndex") - - # TODO: Do this timedelta properly in objToJSON.c See GH #15137 - if ((obj.ndim == 1) and (obj.name in set(obj.index.names)) or - len(obj.columns & obj.index.names)): - msg = "Overlapping names between the index and columns" - raise ValueError(msg) - - obj = obj.copy() - timedeltas = obj.select_dtypes(include=['timedelta']).columns - if len(timedeltas): - obj[timedeltas] = obj[timedeltas].applymap( - lambda x: x.isoformat()) - - self.obj = obj.reset_index() - self.date_format = 'iso' - self.orient = 'records' - - def write(self): - data = super(JSONTableWriter, self).write() - serialized = '{{"schema": {}, "data": {}}}'.format( - dumps(self.schema), data) - return serialized - - -def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, - convert_axes=True, convert_dates=True, keep_default_dates=True, - numpy=False, precise_float=False, date_unit=None, encoding=None, - lines=False): - """ - Convert a JSON string to pandas object - - Parameters - ---------- - path_or_buf : a valid JSON string or file-like, default: None - The string could be a URL. Valid URL schemes include http, ftp, s3, and - file. For file URLs, a host is expected. For instance, a local file - could be ``file://localhost/path/to/table.json`` - - orient : string, - Indication of expected JSON string format. - Compatible JSON strings can be produced by ``to_json()`` with a - corresponding orient value. - The set of possible orients is: - - - ``'split'`` : dict like - ``{index -> [index], columns -> [columns], data -> [values]}`` - - ``'records'`` : list like - ``[{column -> value}, ... , {column -> value}]`` - - ``'index'`` : dict like ``{index -> {column -> value}}`` - - ``'columns'`` : dict like ``{column -> {index -> value}}`` - - ``'values'`` : just the values array - - The allowed and default values depend on the value - of the `typ` parameter. - - * when ``typ == 'series'``, - - - allowed orients are ``{'split','records','index'}`` - - default is ``'index'`` - - The Series index must be unique for orient ``'index'``. - - * when ``typ == 'frame'``, - - - allowed orients are ``{'split','records','index', - 'columns','values'}`` - - default is ``'columns'`` - - The DataFrame index must be unique for orients ``'index'`` and - ``'columns'``. - - The DataFrame columns must be unique for orients ``'index'``, - ``'columns'``, and ``'records'``. - - typ : type of object to recover (series or frame), default 'frame' - dtype : boolean or dict, default True - If True, infer dtypes, if a dict of column to dtype, then use those, - if False, then don't infer dtypes at all, applies only to the data. - convert_axes : boolean, default True - Try to convert the axes to the proper dtypes. - convert_dates : boolean, default True - List of columns to parse for dates; If True, then try to parse - datelike columns default is True; a column label is datelike if - - * it ends with ``'_at'``, - - * it ends with ``'_time'``, - - * it begins with ``'timestamp'``, - - * it is ``'modified'``, or - - * it is ``'date'`` - - keep_default_dates : boolean, default True - If parsing dates, then parse the default datelike columns - numpy : boolean, default False - Direct decoding to numpy arrays. Supports numeric data only, but - non-numeric column and index labels are supported. Note also that the - JSON ordering MUST be the same for each term if numpy=True. - precise_float : boolean, default False - Set to enable usage of higher precision (strtod) function when - decoding string to double values. Default (False) is to use fast but - less precise builtin functionality - date_unit : string, default None - The timestamp unit to detect if converting dates. The default behaviour - is to try and detect the correct precision, but if this is not desired - then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, - milliseconds, microseconds or nanoseconds respectively. - lines : boolean, default False - Read the file as a json object per line. - - .. versionadded:: 0.19.0 - - encoding : str, default is 'utf-8' - The encoding to use to decode py3 bytes. - - .. versionadded:: 0.19.0 - - Returns - ------- - result : Series or DataFrame, depending on the value of `typ`. - - See Also - -------- - DataFrame.to_json - - Examples - -------- - - >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']], - ... index=['row 1', 'row 2'], - ... columns=['col 1', 'col 2']) - - Encoding/decoding a Dataframe using ``'split'`` formatted JSON: - - >>> df.to_json(orient='split') - '{"columns":["col 1","col 2"], - "index":["row 1","row 2"], - "data":[["a","b"],["c","d"]]}' - >>> pd.read_json(_, orient='split') - col 1 col 2 - row 1 a b - row 2 c d - - Encoding/decoding a Dataframe using ``'index'`` formatted JSON: - - >>> df.to_json(orient='index') - '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}' - >>> pd.read_json(_, orient='index') - col 1 col 2 - row 1 a b - row 2 c d - - Encoding/decoding a Dataframe using ``'records'`` formatted JSON. - Note that index labels are not preserved with this encoding. - - >>> df.to_json(orient='records') - '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]' - >>> pd.read_json(_, orient='records') - col 1 col 2 - 0 a b - 1 c d - - Encoding with Table Schema - - >>> df.to_json(orient='table') - '{"schema": {"fields": [{"name": "index", "type": "string"}, - {"name": "col 1", "type": "string"}, - {"name": "col 2", "type": "string"}], - "primaryKey": "index", - "pandas_version": "0.20.0"}, - "data": [{"index": "row 1", "col 1": "a", "col 2": "b"}, - {"index": "row 2", "col 1": "c", "col 2": "d"}]}' - """ - - filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, - encoding=encoding) - if isinstance(filepath_or_buffer, compat.string_types): - try: - exists = os.path.exists(filepath_or_buffer) - - # if the filepath is too long will raise here - # 5874 - except (TypeError, ValueError): - exists = False - - if exists: - fh, handles = _get_handle(filepath_or_buffer, 'r', - encoding=encoding) - json = fh.read() - fh.close() - else: - json = filepath_or_buffer - elif hasattr(filepath_or_buffer, 'read'): - json = filepath_or_buffer.read() - else: - json = filepath_or_buffer - - if lines: - # If given a json lines file, we break the string into lines, add - # commas and put it in a json list to make a valid json object. - lines = list(StringIO(json.strip())) - json = '[' + ','.join(lines) + ']' - - obj = None - if typ == 'frame': - obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, - date_unit).parse() - - if typ == 'series' or obj is None: - if not isinstance(dtype, bool): - dtype = dict(data=dtype) - obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, - date_unit).parse() - - return obj - - -class Parser(object): - - _STAMP_UNITS = ('s', 'ms', 'us', 'ns') - _MIN_STAMPS = { - 's': long(31536000), - 'ms': long(31536000000), - 'us': long(31536000000000), - 'ns': long(31536000000000000)} - - def __init__(self, json, orient, dtype=True, convert_axes=True, - convert_dates=True, keep_default_dates=False, numpy=False, - precise_float=False, date_unit=None): - self.json = json - - if orient is None: - orient = self._default_orient - - self.orient = orient - self.dtype = dtype - - if orient == "split": - numpy = False - - if date_unit is not None: - date_unit = date_unit.lower() - if date_unit not in self._STAMP_UNITS: - raise ValueError('date_unit must be one of %s' % - (self._STAMP_UNITS,)) - self.min_stamp = self._MIN_STAMPS[date_unit] - else: - self.min_stamp = self._MIN_STAMPS['s'] - - self.numpy = numpy - self.precise_float = precise_float - self.convert_axes = convert_axes - self.convert_dates = convert_dates - self.date_unit = date_unit - self.keep_default_dates = keep_default_dates - self.obj = None - - def check_keys_split(self, decoded): - "checks that dict has only the appropriate keys for orient='split'" - bad_keys = set(decoded.keys()).difference(set(self._split_keys)) - if bad_keys: - bad_keys = ", ".join(bad_keys) - raise ValueError(u("JSON data had unexpected key(s): %s") % - pprint_thing(bad_keys)) - - def parse(self): - - # try numpy - numpy = self.numpy - if numpy: - self._parse_numpy() - - else: - self._parse_no_numpy() - - if self.obj is None: - return None - if self.convert_axes: - self._convert_axes() - self._try_convert_types() - return self.obj - - def _convert_axes(self): - """ try to convert axes """ - for axis in self.obj._AXIS_NUMBERS.keys(): - new_axis, result = self._try_convert_data( - axis, self.obj._get_axis(axis), use_dtypes=False, - convert_dates=True) - if result: - setattr(self.obj, axis, new_axis) - - def _try_convert_types(self): - raise AbstractMethodError(self) - - def _try_convert_data(self, name, data, use_dtypes=True, - convert_dates=True): - """ try to parse a ndarray like into a column by inferring dtype """ - - # don't try to coerce, unless a force conversion - if use_dtypes: - if self.dtype is False: - return data, False - elif self.dtype is True: - pass - - else: - - # dtype to force - dtype = (self.dtype.get(name) - if isinstance(self.dtype, dict) else self.dtype) - if dtype is not None: - try: - dtype = np.dtype(dtype) - return data.astype(dtype), True - except Exception: - return data, False - - if convert_dates: - new_data, result = self._try_convert_to_date(data) - if result: - return new_data, True - - result = False - - if data.dtype == 'object': - - # try float - try: - data = data.astype('float64') - result = True - except Exception: - pass - - if data.dtype.kind == 'f': - - if data.dtype != 'float64': - - # coerce floats to 64 - try: - data = data.astype('float64') - result = True - except Exception: - pass - - # do't coerce 0-len data - if len(data) and (data.dtype == 'float' or data.dtype == 'object'): - - # coerce ints if we can - try: - new_data = data.astype('int64') - if (new_data == data).all(): - data = new_data - result = True - except Exception: - pass - - # coerce ints to 64 - if data.dtype == 'int': - - # coerce floats to 64 - try: - data = data.astype('int64') - result = True - except Exception: - pass - - return data, result - - def _try_convert_to_date(self, data): - """ try to parse a ndarray like into a date column - try to coerce object in epoch/iso formats and - integer/float in epcoh formats, return a boolean if parsing - was successful """ - - # no conversion on empty - if not len(data): - return data, False - - new_data = data - if new_data.dtype == 'object': - try: - new_data = data.astype('int64') - except Exception: - pass - - # ignore numbers that are out of range - if issubclass(new_data.dtype.type, np.number): - in_range = (isnull(new_data.values) | (new_data > self.min_stamp) | - (new_data.values == iNaT)) - if not in_range.all(): - return data, False - - date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS - for date_unit in date_units: - try: - new_data = to_datetime(new_data, errors='raise', - unit=date_unit) - except ValueError: - continue - except Exception: - break - return new_data, True - return data, False - - def _try_convert_dates(self): - raise AbstractMethodError(self) - - -class SeriesParser(Parser): - _default_orient = 'index' - _split_keys = ('name', 'index', 'data') - - def _parse_no_numpy(self): - - json = self.json - orient = self.orient - if orient == "split": - decoded = dict((str(k), v) - for k, v in compat.iteritems(loads( - json, - precise_float=self.precise_float))) - self.check_keys_split(decoded) - self.obj = Series(dtype=None, **decoded) - else: - self.obj = Series( - loads(json, precise_float=self.precise_float), dtype=None) - - def _parse_numpy(self): - - json = self.json - orient = self.orient - if orient == "split": - decoded = loads(json, dtype=None, numpy=True, - precise_float=self.precise_float) - decoded = dict((str(k), v) for k, v in compat.iteritems(decoded)) - self.check_keys_split(decoded) - self.obj = Series(**decoded) - elif orient == "columns" or orient == "index": - self.obj = Series(*loads(json, dtype=None, numpy=True, - labelled=True, - precise_float=self.precise_float)) - else: - self.obj = Series(loads(json, dtype=None, numpy=True, - precise_float=self.precise_float)) - - def _try_convert_types(self): - if self.obj is None: - return - obj, result = self._try_convert_data( - 'data', self.obj, convert_dates=self.convert_dates) - if result: - self.obj = obj - - -class FrameParser(Parser): - _default_orient = 'columns' - _split_keys = ('columns', 'index', 'data') - - def _parse_numpy(self): - - json = self.json - orient = self.orient - - if orient == "columns": - args = loads(json, dtype=None, numpy=True, labelled=True, - precise_float=self.precise_float) - if args: - args = (args[0].T, args[2], args[1]) - self.obj = DataFrame(*args) - elif orient == "split": - decoded = loads(json, dtype=None, numpy=True, - precise_float=self.precise_float) - decoded = dict((str(k), v) for k, v in compat.iteritems(decoded)) - self.check_keys_split(decoded) - self.obj = DataFrame(**decoded) - elif orient == "values": - self.obj = DataFrame(loads(json, dtype=None, numpy=True, - precise_float=self.precise_float)) - else: - self.obj = DataFrame(*loads(json, dtype=None, numpy=True, - labelled=True, - precise_float=self.precise_float)) - - def _parse_no_numpy(self): - - json = self.json - orient = self.orient - - if orient == "columns": - self.obj = DataFrame( - loads(json, precise_float=self.precise_float), dtype=None) - elif orient == "split": - decoded = dict((str(k), v) - for k, v in compat.iteritems(loads( - json, - precise_float=self.precise_float))) - self.check_keys_split(decoded) - self.obj = DataFrame(dtype=None, **decoded) - elif orient == "index": - self.obj = DataFrame( - loads(json, precise_float=self.precise_float), dtype=None).T - else: - self.obj = DataFrame( - loads(json, precise_float=self.precise_float), dtype=None) - - def _process_converter(self, f, filt=lambda col, c: True): - """ take a conversion function and possibly recreate the frame """ - - needs_new_obj = False - new_obj = dict() - for i, (col, c) in enumerate(self.obj.iteritems()): - if filt(col, c): - new_data, result = f(col, c) - if result: - c = new_data - needs_new_obj = True - new_obj[i] = c - - if needs_new_obj: - - # possibly handle dup columns - new_obj = DataFrame(new_obj, index=self.obj.index) - new_obj.columns = self.obj.columns - self.obj = new_obj - - def _try_convert_types(self): - if self.obj is None: - return - if self.convert_dates: - self._try_convert_dates() - - self._process_converter( - lambda col, c: self._try_convert_data(col, c, convert_dates=False)) - - def _try_convert_dates(self): - if self.obj is None: - return - - # our columns to parse - convert_dates = self.convert_dates - if convert_dates is True: - convert_dates = [] - convert_dates = set(convert_dates) - - def is_ok(col): - """ return if this col is ok to try for a date parse """ - if not isinstance(col, compat.string_types): - return False - - col_lower = col.lower() - if (col_lower.endswith('_at') or - col_lower.endswith('_time') or - col_lower == 'modified' or - col_lower == 'date' or - col_lower == 'datetime' or - col_lower.startswith('timestamp')): - return True - return False - - self._process_converter( - lambda col, c: self._try_convert_to_date(c), - lambda col, c: ((self.keep_default_dates and is_ok(col)) or - col in convert_dates)) diff --git a/qgrid/pd_json/normalize.py b/qgrid/pd_json/normalize.py deleted file mode 100644 index 211c40b..0000000 --- a/qgrid/pd_json/normalize.py +++ /dev/null @@ -1,265 +0,0 @@ -# --------------------------------------------------------------------- -# JSON normalization routines - -import copy -from collections import defaultdict -import numpy as np - -from pandas import compat, DataFrame - - -def _convert_to_line_delimits(s): - """Helper function that converts json lists to line delimited json.""" - - # Determine we have a JSON list to turn to lines otherwise just return the - # json object, only lists can - if not s[0] == '[' and s[-1] == ']': - return s - s = s[1:-1] - - return s - - -def nested_to_record(ds, prefix="", sep=".", level=0): - """a simplified json_normalize - - converts a nested dict into a flat dict ("record"), unlike json_normalize, - it does not attempt to extract a subset of the data. - - Parameters - ---------- - ds : dict or list of dicts - prefix: the prefix, optional, default: "" - sep : string, default '.' - Nested records will generate names separated by sep, - e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar - - .. versionadded:: 0.20.0 - - level: the number of levels in the jason string, optional, default: 0 - - Returns - ------- - d - dict or list of dicts, matching `ds` - - Examples - -------- - - IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2), - nested=dict(e=dict(c=1,d=2),d=2))) - Out[52]: - {'dict1.c': 1, - 'dict1.d': 2, - 'flat1': 1, - 'nested.d': 2, - 'nested.e.c': 1, - 'nested.e.d': 2} - """ - singleton = False - if isinstance(ds, dict): - ds = [ds] - singleton = True - - new_ds = [] - for d in ds: - - new_d = copy.deepcopy(d) - for k, v in d.items(): - # each key gets renamed with prefix - if not isinstance(k, compat.string_types): - k = str(k) - if level == 0: - newkey = k - else: - newkey = prefix + sep + k - - # only dicts gets recurse-flattend - # only at level>1 do we rename the rest of the keys - if not isinstance(v, dict): - if level != 0: # so we skip copying for top level, common case - v = new_d.pop(k) - new_d[newkey] = v - continue - else: - v = new_d.pop(k) - new_d.update(nested_to_record(v, newkey, sep, level + 1)) - new_ds.append(new_d) - - if singleton: - return new_ds[0] - return new_ds - - -def json_normalize(data, record_path=None, meta=None, - meta_prefix=None, - record_prefix=None, - errors='raise', - sep='.'): - """ - "Normalize" semi-structured JSON data into a flat table - - Parameters - ---------- - data : dict or list of dicts - Unserialized JSON objects - record_path : string or list of strings, default None - Path in each object to list of records. If not passed, data will be - assumed to be an array of records - meta : list of paths (string or list of strings), default None - Fields to use as metadata for each record in resulting table - record_prefix : string, default None - If True, prefix records with dotted (?) path, e.g. foo.bar.field if - path to records is ['foo', 'bar'] - meta_prefix : string, default None - errors : {'raise', 'ignore'}, default 'raise' - - * 'ignore' : will ignore KeyError if keys listed in meta are not - always present - * 'raise' : will raise KeyError if keys listed in meta are not - always present - - .. versionadded:: 0.20.0 - - sep : string, default '.' - Nested records will generate names separated by sep, - e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar - - .. versionadded:: 0.20.0 - - - Returns - ------- - frame : DataFrame - - Examples - -------- - - >>> data = [{'state': 'Florida', - ... 'shortname': 'FL', - ... 'info': { - ... 'governor': 'Rick Scott' - ... }, - ... 'counties': [{'name': 'Dade', 'population': 12345}, - ... {'name': 'Broward', 'population': 40000}, - ... {'name': 'Palm Beach', 'population': 60000}]}, - ... {'state': 'Ohio', - ... 'shortname': 'OH', - ... 'info': { - ... 'governor': 'John Kasich' - ... }, - ... 'counties': [{'name': 'Summit', 'population': 1234}, - ... {'name': 'Cuyahoga', 'population': 1337}]}] - >>> from pandas.io.json import json_normalize - >>> result = json_normalize(data, 'counties', ['state', 'shortname', - ... ['info', 'governor']]) - >>> result - name population info.governor state shortname - 0 Dade 12345 Rick Scott Florida FL - 1 Broward 40000 Rick Scott Florida FL - 2 Palm Beach 60000 Rick Scott Florida FL - 3 Summit 1234 John Kasich Ohio OH - 4 Cuyahoga 1337 John Kasich Ohio OH - - """ - def _pull_field(js, spec): - result = js - if isinstance(spec, list): - for field in spec: - result = result[field] - else: - result = result[spec] - - return result - - if isinstance(data, list) and len(data) == 0: - return DataFrame() - - # A bit of a hackjob - if isinstance(data, dict): - data = [data] - - if record_path is None: - if any([isinstance(x, dict) for x in compat.itervalues(data[0])]): - # naive normalization, this is idempotent for flat records - # and potentially will inflate the data considerably for - # deeply nested structures: - # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@} - # - # TODO: handle record value which are lists, at least error - # reasonably - data = nested_to_record(data, sep=sep) - return DataFrame(data) - elif not isinstance(record_path, list): - record_path = [record_path] - - if meta is None: - meta = [] - elif not isinstance(meta, list): - meta = [meta] - - for i, x in enumerate(meta): - if not isinstance(x, list): - meta[i] = [x] - - # Disastrously inefficient for now - records = [] - lengths = [] - - meta_vals = defaultdict(list) - if not isinstance(sep, compat.string_types): - sep = str(sep) - meta_keys = [sep.join(val) for val in meta] - - def _recursive_extract(data, path, seen_meta, level=0): - if len(path) > 1: - for obj in data: - for val, key in zip(meta, meta_keys): - if level + 1 == len(val): - seen_meta[key] = _pull_field(obj, val[-1]) - - _recursive_extract(obj[path[0]], path[1:], - seen_meta, level=level + 1) - else: - for obj in data: - recs = _pull_field(obj, path[0]) - - # For repeating the metadata later - lengths.append(len(recs)) - - for val, key in zip(meta, meta_keys): - if level + 1 > len(val): - meta_val = seen_meta[key] - else: - try: - meta_val = _pull_field(obj, val[level:]) - except KeyError as e: - if errors == 'ignore': - meta_val = np.nan - else: - raise \ - KeyError("Try running with " - "errors='ignore' as key " - "%s is not always present", e) - meta_vals[key].append(meta_val) - - records.extend(recs) - - _recursive_extract(data, record_path, {}, level=0) - - result = DataFrame(records) - - if record_prefix is not None: - result.rename(columns=lambda x: record_prefix + x, inplace=True) - - # Data types, a problem - for k, v in compat.iteritems(meta_vals): - if meta_prefix is not None: - k = meta_prefix + k - - if k in result: - raise ValueError('Conflicting metadata name %s, ' - 'need distinguishing prefix ' % k) - - result[k] = np.array(v).repeat(lengths) - - return result diff --git a/qgrid/pd_json/table_schema.py b/qgrid/pd_json/table_schema.py deleted file mode 100644 index c815008..0000000 --- a/qgrid/pd_json/table_schema.py +++ /dev/null @@ -1,178 +0,0 @@ -""" -Table Schema builders - -http://specs.frictionlessdata.io/json-table-schema/ -""" -from pandas.core.common import ( - is_integer_dtype, is_timedelta64_dtype, is_numeric_dtype, - is_bool_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_categorical_dtype, is_string_dtype -) - - -def as_json_table_type(x): - """ - Convert a NumPy / pandas type to its corresponding json_table. - - Parameters - ---------- - x : array or dtype - - Returns - ------- - t : str - the Table Schema data types - - Notes - ----- - This table shows the relationship between NumPy / pandas dtypes, - and Table Schema dtypes. - - ============== ================= - Pandas type Table Schema type - ============== ================= - int64 integer - float64 number - bool boolean - datetime64[ns] datetime - timedelta64[ns] duration - object str - categorical any - =============== ================= - """ - if is_integer_dtype(x): - return 'integer' - elif is_bool_dtype(x): - return 'boolean' - elif is_numeric_dtype(x): - return 'number' - elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x)): - return 'datetime' - elif is_timedelta64_dtype(x): - return 'duration' - elif is_categorical_dtype(x): - return 'any' - elif is_string_dtype(x): - return 'string' - else: - return 'any' - - -def set_default_names(data): - """Sets index names to 'index' for regular, or 'level_x' for Multi""" - if all(name is not None for name in data.index.names): - return data - - data = data.copy() - if data.index.nlevels > 1: - names = [name if name is not None else 'level_{}'.format(i) - for i, name in enumerate(data.index.names)] - data.index.names = names - else: - data.index.name = data.index.name or 'index' - return data - - -def make_field(arr, dtype=None): - dtype = dtype or arr.dtype - if arr.name is None: - name = 'values' - else: - name = arr.name - field = {'name': name, - 'type': as_json_table_type(dtype)} - - if is_categorical_dtype(arr): - if hasattr(arr, 'categories'): - cats = arr.categories - ordered = arr.ordered - else: - cats = arr.cat.categories - ordered = arr.cat.ordered - field['constraints'] = {"enum": list(cats)} - field['ordered'] = ordered - elif is_datetime64tz_dtype(arr): - if hasattr(arr, 'dt'): - field['tz'] = arr.dt.tz.zone - else: - field['tz'] = arr.tz.zone - return field - - -def build_table_schema(data, index=True, primary_key=None, version=True): - """ - Create a Table schema from ``data``. - - Parameters - ---------- - data : Series, DataFrame - index : bool, default True - Whether to include ``data.index`` in the schema. - primary_key : bool or None, default True - column names to designate as the primary key. - The default `None` will set `'primaryKey'` to the index - level or levels if the index is unique. - version : bool, default True - Whether to include a field `pandas_version` with the version - of pandas that generated the schema. - - Returns - ------- - schema : dict - - Examples - -------- - >>> df = pd.DataFrame( - ... {'A': [1, 2, 3], - ... 'B': ['a', 'b', 'c'], - ... 'C': pd.date_range('2016-01-01', freq='d', periods=3), - ... }, index=pd.Index(range(3), name='idx')) - >>> build_table_schema(df) - {'fields': [{'name': 'idx', 'type': 'integer'}, - {'name': 'A', 'type': 'integer'}, - {'name': 'B', 'type': 'string'}, - {'name': 'C', 'type': 'datetime'}], - 'pandas_version': '0.20.0', - 'primaryKey': ['idx']} - - Notes - ----- - See `_as_json_table_type` for conversion types. - Timedeltas as converted to ISO8601 duration format with - 9 decimal places after the secnods field for nanosecond precision. - - Categoricals are converted to the `any` dtype, and use the `enum` field - constraint to list the allowed values. The `ordered` attribute is included - in an `ordered` field. - """ - if index is True: - data = set_default_names(data) - - schema = {} - fields = [] - - if index: - if data.index.nlevels > 1: - for level in data.index.levels: - fields.append(make_field(level)) - else: - fields.append(make_field(data.index)) - - if data.ndim > 1: - for column, s in data.iteritems(): - fields.append(make_field(s)) - else: - fields.append(make_field(data)) - - schema['fields'] = fields - if index and data.index.is_unique and primary_key is None: - if data.index.nlevels == 1: - schema['primaryKey'] = [data.index.name] - else: - schema['primaryKey'] = data.index.names - elif primary_key is not None: - schema['primaryKey'] = primary_key - - if version: - schema['pandas_version'] = '0.20.0' - return schema diff --git a/requirements.txt b/requirements.txt index 25dad02..ccc2642 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -pandas>=0.18.0 +pandas>=0.20.0 ipywidgets>=7.0.0 diff --git a/setup.py b/setup.py index 72b3797..2735b9e 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup, find_packages, Command, find_namespace_packages +from setuptools import setup, find_packages, Command from setuptools.command.sdist import sdist from setuptools.command.build_py import build_py from setuptools.command.egg_info import egg_info