diff --git a/glom/core.py b/glom/core.py index a903b91..10f6bbd 100644 --- a/glom/core.py +++ b/glom/core.py @@ -23,7 +23,6 @@ import sys import pdb import copy -import weakref import operator from abc import ABCMeta from pprint import pprint @@ -613,7 +612,7 @@ def __init__(self, *path_parts): if isinstance(part, Path): part = part.path_t if isinstance(part, TType): - sub_parts = _T_PATHS[part] + sub_parts = object.__getattribute__(part, '__op_args__') if sub_parts[0] is not T: raise ValueError('path segment must be path from T, not %r' % sub_parts[0]) @@ -625,6 +624,9 @@ def __init__(self, *path_parts): path_t = _t_child(path_t, 'P', part) self.path_t = path_t + _CACHE = {} + _MAX_CACHE = 10000 + @classmethod def from_text(cls, text): """Make a Path from .-delimited text: @@ -633,20 +635,26 @@ def from_text(cls, text): Path('a', 'b', 'c') """ - return cls(*text.split('.')) + if text not in cls._CACHE: + if len(cls._CACHE) > cls._MAX_CACHE: + return cls(*text.split('.')) + cls._CACHE[text] = cls(*text.split('.')) + return cls._CACHE[text] def glomit(self, target, scope): # The entrypoint for the Path extension return _t_eval(target, self.path_t, scope) def __len__(self): - return (len(_T_PATHS[self.path_t]) - 1) // 2 + return (len(object.__getattribute__(self.path_t, '__op_args__')) - 1) // 2 def __eq__(self, other): + op_args = object.__getattribute__(self.path_t, '__op_args__') if type(other) is Path: - return _T_PATHS[self.path_t] == _T_PATHS[other.path_t] - elif type(other) is TType: - return _T_PATHS[self.path_t] == _T_PATHS[other] + other = other.path_t + if type(other) is TType: + other_op_args = object.__getattribute__(other, '__op_args__') + return op_args == other_op_args return False def __ne__(self, other): @@ -659,7 +667,7 @@ def values(self): >>> Path(T.a.b, 'c', T['d']).values() ('a', 'b', 'c', 'd') """ - cur_t_path = _T_PATHS[self.path_t] + cur_t_path = object.__getattribute__(self.path_t, '__op_args__') return cur_t_path[2::2] def items(self): @@ 
-670,7 +678,7 @@ def items(self): (('.', 'a'), ('.', 'b'), ('P', 'c'), ('[', 'd')) """ - cur_t_path = _T_PATHS[self.path_t] + cur_t_path = object.__getattribute__(self.path_t, '__op_args__') return tuple(zip(cur_t_path[1::2], cur_t_path[2::2])) def startswith(self, other): @@ -680,20 +688,19 @@ def startswith(self, other): other = other.path_t if not isinstance(other, TType): raise TypeError('can only check if Path starts with string, Path or T') - o_path = _T_PATHS[other] - return _T_PATHS[self.path_t][:len(o_path)] == o_path + o_path = object.__getattribute__(other, '__op_args__') + path = object.__getattribute__(self.path_t, '__op_args__') + return path[:len(o_path)] == o_path def from_t(self): '''return the same path but starting from T''' - t_path = _T_PATHS[self.path_t] - if t_path[0] is S: - new_t = TType() - _T_PATHS[new_t] = (T,) + t_path[1:] - return Path(new_t) - return self + t_path = object.__getattribute__(self.path_t, '__op_args__') + if t_path[0] is T: + return self + return Path(TType((T,) + t_path[1:])) def __getitem__(self, i): - cur_t_path = _T_PATHS[self.path_t] + cur_t_path = object.__getattribute__(self.path_t, '__op_args__') try: step = i.step start = i.start if i.start is not None else 0 @@ -709,16 +716,14 @@ def __getitem__(self, i): raise IndexError('Path index out of range') stop = ((i + 1) * 2) + 1 if i >= 0 else ((i + 1) * 2) + len(cur_t_path) - new_t = TType() new_path = cur_t_path[start:stop] if step is not None and step != 1: new_path = tuple(zip(new_path[::2], new_path[1::2]))[::step] new_path = sum(new_path, ()) - _T_PATHS[new_t] = (cur_t_path[0],) + new_path - return Path(new_t) + return Path(TType((cur_t_path[0],) + new_path)) def __repr__(self): - return _format_path(_T_PATHS[self.path_t][1:]) + return _format_path(object.__getattribute__(self.path_t, '__op_args__')[1:]) def _format_path(t_path): @@ -987,9 +992,13 @@ def _trace(self, target, spec, scope): scope[glom] = scope[Inspect] if self.echo: print('---') + # TODO: switch from 
scope[Path] to the Target-Spec format trace above + # ... but maybe be smart about only printing deltas instead of the whole + # thing print('path: ', scope[Path] + [spec]) print('target:', target) if self.breakpoint: + # TODO: real debugger here? self.breakpoint() try: ret = scope[Inspect](target, spec, scope) @@ -1399,9 +1408,17 @@ class TType(object): equivalent to accessing the ``__class__`` attribute. """ - __slots__ = ('__weakref__',) + __slots__ = ("__op_args__",) - def __getattr__(self, name): + def __init__(self, op_args=None): + if op_args is None: + op_args = (self,) # for T, etc roots + self.__op_args__ = op_args + assert op_args != () + + def __getattribute__(self, name): + if name in ("__", "__repr__"): + return object.__getattribute__(self, name) if name.startswith('__'): raise AttributeError('T instances reserve dunder attributes.' ' To access the "{name}" attribute, use' @@ -1424,29 +1441,18 @@ def __(self, name): return _t_child(self, '.', '__' + name) def __repr__(self): - t_path = _T_PATHS[self] - return _format_t(t_path[1:], t_path[0]) - - def __getstate__(self): - t_path = _T_PATHS[self] - return tuple(({T: 'T', S: 'S', A: 'A'}[t_path[0]],) + t_path[1:]) - - def __setstate__(self, state): - _T_PATHS[self] = ({'T': T, 'S': S, 'A': A}[state[0]],) + state[1:] - - -_T_PATHS = weakref.WeakKeyDictionary() + op_args = object.__getattribute__(self, '__op_args__') + return _format_t(op_args[1:], op_args[0]) def _t_child(parent, operation, arg): - t = TType() - base = _T_PATHS[parent] - if base[0] is A and operation not in ('.', '[', 'P'): + op_args = object.__getattribute__(parent, '__op_args__') + if op_args[0] is A and operation not in ('.', '[', 'P'): # whitelist rather than blacklist assignment friendly operations # TODO: error type? 
raise BadSpec("operation not allowed on A assignment path") - _T_PATHS[t] = base + (operation, arg) - return t + + return TType(op_args + (operation, arg)) def _s_first_magic(scope, key, _t): @@ -1465,7 +1471,7 @@ def _s_first_magic(scope, key, _t): def _t_eval(target, _t, scope): - t_path = _T_PATHS[_t] + t_path = object.__getattribute__(_t, '__op_args__') i = 1 fetch_till = len(t_path) root = t_path[0] @@ -1547,10 +1553,6 @@ def _t_eval(target, _t, scope): S = TType() # like T, but means grab stuff from Scope, not Target A = TType() # like S, but shorthand to assign target to scope -_T_PATHS[T] = (T,) -_T_PATHS[S] = (S,) -_T_PATHS[A] = (A,) - UP = make_sentinel('UP') ROOT = make_sentinel('ROOT') @@ -1807,6 +1809,7 @@ class TargetRegistry(object): def __init__(self, register_default_types=True): self._op_type_map = {} self._op_type_tree = {} # see _register_fuzzy_type for details + self._type_cache = {} self._op_auto_map = OrderedDict() # op name to function that returns handler function @@ -1825,22 +1828,26 @@ def get_handler(self, op, obj, path=None, raise_exc=True): """ ret = False obj_type = type(obj) - type_map = self.get_type_map(op) - if type_map: - try: - ret = type_map[obj_type] - except KeyError: - type_tree = self._op_type_tree.get(op, {}) - closest = self._get_closest_type(obj, type_tree=type_tree) - if closest is None: - ret = False - else: - ret = type_map[closest] + cache_key = (obj_type, op) + if cache_key not in self._type_cache: + type_map = self.get_type_map(op) + if type_map: + try: + ret = type_map[obj_type] + except KeyError: + type_tree = self._op_type_tree.get(op, {}) + closest = self._get_closest_type(obj, type_tree=type_tree) + if closest is None: + ret = False + else: + ret = type_map[closest] - if ret is False and raise_exc: - raise UnregisteredTarget(op, obj_type, type_map=type_map, path=path) + self._type_cache[cache_key] = ret - return ret + 
ret = self._type_cache[cache_key] + if ret is False and raise_exc: + raise UnregisteredTarget(op, obj_type, type_map=self.get_type_map(op), path=path) + return ret def get_type_map(self, op): try: @@ -1928,6 +1935,8 @@ def register(self, target_type, **kwargs): for op_name in new_op_map: self._register_fuzzy_type(op_name, target_type) + self._type_cache = {} # reset type cache + return def register_op(self, op_name, auto_func=None, exact=False): @@ -2119,21 +2128,23 @@ def _has_callable_glomit(obj): def _glom(target, spec, scope): parent = scope + pmap = parent.maps[0] scope = scope.new_child({ T: target, Spec: spec, UP: parent, CHILD_ERRORS: [], + MODE: pmap[MODE], }) - parent[LAST_CHILD_SCOPE] = scope + pmap[LAST_CHILD_SCOPE] = scope try: - if isinstance(spec, TType): # must go first, due to callability + if type(spec) is TType: # must go first, due to callability return _t_eval(target, spec, scope) elif _has_callable_glomit(spec): return spec.glomit(target, scope) - return scope[MODE](target, spec, scope) + return scope.maps[0][MODE](target, spec, scope) except Exception as e: scope.maps[1][CHILD_ERRORS].append(scope) scope.maps[0][CUR_ERROR] = e @@ -2147,6 +2158,8 @@ def _glom(target, spec, scope): def AUTO(target, spec, scope): + if type(spec) is str: # shortcut to make deep-get use case faster + return _t_eval(target, Path.from_text(spec).path_t, scope) if isinstance(spec, dict): return _handle_dict(target, spec, scope) elif isinstance(spec, list): diff --git a/glom/matching.py b/glom/matching.py index 9446fce..1228535 100644 --- a/glom/matching.py +++ b/glom/matching.py @@ -1003,9 +1003,6 @@ def glomit(self, target, scope): type(target).__name__)) if errs: - # TODO: due to the usage of basic path (not a Path - # object), the format can be a bit inconsistent here - # (e.g., 'a.b' and ['a', 'b']) raise CheckError(errs, self, scope[Path]) return ret diff --git a/glom/test/perf_report.py b/glom/test/perf_report.py new file mode 100644 index 0000000..6315fb5 --- /dev/null +++ b/glom/test/perf_report.py @@ -0,0 +1,87 @@ +""" +slow gloms that came up 
organically, used as performance metrics +""" +import time +import gc + +import attr + +from glom import glom, T + + + +STR_SPEC = [{ + 'id': ('id', str), + 'name': 'short_name', + 'external_id': 'external_id', + 'created_date': 'created_date', +}] + + +T_SPEC = [{ + 'id': (T.id, str), + 'name': T.short_name, + 'external_id': T.external_id, + 'created_date': T.created_date, +}] + + +def func(data): + return [{ + 'id': str(t.id), + 'name': t.short_name, + 'external_id': t.external_id, + 'created_date': t.created_date + } for t in data] + + +def setup_list_of_dict(num=100): + """ + a common use case is list-of-dicts object processing + to prepare internal objects for JSON serialization + """ + Obj = attr.make_class( + 'Obj', ['id', 'short_name', 'external_id', 'created_date']) + + data = [ + Obj(i, 'name' + str(i), 'external' + str(i), 'now') for i in range(num)] + + return data + + +def run(spec, data): + start = time.time() + glom(data, spec) + end = time.time() + print("{} us per object".format((end - start) / len(data) * 1e6)) + + +def ratio(spec, func, data): + glom_dur = [] + py_dur = [] + for i in range(10): + t1 = time.perf_counter_ns() + glom(data, spec) + t2 = time.perf_counter_ns() + func(data) + t3 = time.perf_counter_ns() + glom_dur.append(t2 - t1) + py_dur.append(t3 - t2) + + glom_avg = sum(sorted(glom_dur)[2:-2]) + py_avg = sum(sorted(py_dur)[2:-2]) + + return 1.0 * glom_avg / py_avg + + +if __name__ == "__main__": + import cProfile + data = setup_list_of_dict(100000) + run(STR_SPEC, data) + run(STR_SPEC, data) + print(ratio(STR_SPEC, func, setup_list_of_dict(1000))) + print(ratio(STR_SPEC, func, setup_list_of_dict(1000))) + + +# suggest using scalene to profile with: +# $ scalene glom/test/perf_report.py --profile-all --reduced-profile --cpu-only --outfile SCALENE-CPU.txt diff --git a/glom/test/test_target_types.py b/glom/test/test_target_types.py index 97c7c0f..01b2fdd 100644 --- a/glom/test/test_target_types.py +++ b/glom/test/test_target_types.py 
@@ -73,9 +73,9 @@ def test_types_bare(): with pytest.raises(UnregisteredTarget) as exc_info: glommer.glom({'test': [{'hi': 'hi'}]}, ('test', ['hi'])) # feel free to update the "(at ['test'])" part to improve path display - assert str(exc_info.value).find( + assert ( "target type 'list' not registered for 'iterate', " - "expected one of registered types: (dict) (at ['test'])") != -1 + "expected one of registered types: (dict)" in str(exc_info.value)) return