From acdeb3b2abe668691ccc8d34fa7678e86c9d7e13 Mon Sep 17 00:00:00 2001
From: Troy Sincomb
Date: Mon, 27 Jan 2025 11:56:37 -0800
Subject: [PATCH 1/3] blue hook added

---
 .pre-commit-config.yaml       |  10 +-
 pyproject.toml                |   1 +
 quantdb/api.py                | 754 ++++++++++++++++++----------
 quantdb/api_server.py         |   4 +-
 quantdb/auth-config.py        |  30 +-
 quantdb/config.py             |   2 +-
 quantdb/exceptions.py         |   6 +-
 quantdb/ingest.py             | 683 ++++++++++++++++++------------
 quantdb/main.py               |   6 +-
 quantdb/mysql_app/__init__.py |   2 +-
 quantdb/router.py             |  12 +-
 quantdb/utils.py              |  24 +-
 test/test_api.py              |  82 ++--
 13 files changed, 908 insertions(+), 708 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d1dd0c1..2dabc8c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,14 +7,14 @@ repos:
       - id: end-of-file-fixer
       - id: trailing-whitespace
         exclude: docs/img/
-  - repo: https://github.com/psf/black
-    rev: 23.3.0
-    hooks:
-      - id: black
-        args: ["-l", "120"]
   - repo: https://github.com/PyCQA/isort
     rev: 5.12.0
     hooks:
       - id: isort
         name: isort (python)
         args: ["-m", "3", "--tc"]
+  - repo: https://github.com/grantjenks/blue
+    rev: "v0.9.1"
+    hooks:
+      - id: blue
+        args: ["-l", "120"]
diff --git a/pyproject.toml b/pyproject.toml
index 40438fe..a3acead 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
     "orthauth ~= 0.0.18",
     "cython",
     "psycopg2-binary",
+    "pre-commit ~= 4.1.0"
 ]
 name = "quantdb"
 version = "0.1.0"
diff --git a/quantdb/api.py b/quantdb/api.py
index 246422b..44d69fb 100644
--- a/quantdb/api.py
+++ b/quantdb/api.py
@@ -11,7 +11,7 @@
 from quantdb.config import auth
 from quantdb.utils import dbUri, isoformat, log
 
-log = log.getChild("api")
+log = log.getChild('api')
 
 
 class JEncode(json.JSONEncoder):
@@ -29,28 +29,38 @@ def default(self, obj):
 url_sql_where = (  # TODO arity spec here
     # dupes overwrite params but that is ok, this way we get the correct table alias for both cases
-    ('object', 'object', 'cv.object = any(:object)', 'cat'),  # XXX should not use this outside values/ unless we left outer due to intersect ?
-    ('object', 'object', 'qv.object = any(:object)', 'quant'),  # XXX should not use this outside values/ unless we left outer due to intersect ?
-
+    (
+        'object',
+        'object',
+        'cv.object = any(:object)',
+        'cat',
+    ),  # XXX should not use this outside values/ unless we left outer due to intersect ?
+    (
+        'object',
+        'object',
+        'qv.object = any(:object)',
+        'quant',
+    ),  # XXX should not use this outside values/ unless we left outer due to intersect ? 
('desc-inst', 'desc_inst', 'idin.label = any(:desc_inst)', 'both'), ('dataset', 'dataset', 'im.dataset = :dataset', 'both'), ('inst', 'inst', 'im.id_formal = any(:inst)', 'both'), ('inst-parent', 'inst_parent', 'icin.id_formal = any(:inst_parent)', 'both'), ('subject', 'subject', 'im.id_sub = any(:subject)', 'both'), ('sample', 'sample', 'im.id_sam = any(:sample)', 'both'), - ('desc-cat', 'desc_cat', 'cd.label = any(:desc_cat)', 'cat'), - ('value-cat', 'value_cat', 'ct.label = any(:value_cat)', 'cat'), ('value-cat-open', 'value_cat_open', 'cv.value_open = any(:value_cat_open)', 'cat'), - ('unit', 'unit', 'u.label = any(:unit)', 'quant'), ('aspect', 'aspect', 'ain.label = any(:aspect)', 'quant'), ('agg-type', 'agg_type', 'qd.aggregation_type = :agg_type', 'quant'), # TODO shape - ('value-quant', 'value_quant', 'qv.value = :value_quant', 'quant'), - ('value-quant-margin', 'value_quant_margin', 'qv.value <= :value_quant + :value_quant_margin AND qv.value >= :value_quant - :value_quant_margin', 'quant'), + ( + 'value-quant-margin', + 'value_quant_margin', + 'qv.value <= :value_quant + :value_quant_margin AND qv.value >= :value_quant - :value_quant_margin', + 'quant', + ), ('value-quant-min', 'value_quant_min', 'qv.value >= :value_quant_min', 'quant'), ('value-quant-max', 'value_quant_max', 'qv.value <= :value_quant_max', 'quant'), ) @@ -63,13 +73,11 @@ def get_where(kwargs): for u, s, w, t in url_sql_where: if u in kwargs and kwargs[u]: params[s] = kwargs[u] - if t == "cat": + if t == 'cat': _where_cat.append(w) elif t == 'quant': # do not include value-quant if value-quant-margin is provided - if (u == 'value-quant' and - 'value-quant-margin' in kwargs and - kwargs['value-quant-margin']): + if u == 'value-quant' and 'value-quant-margin' in kwargs and kwargs['value-quant-margin']: continue else: _where_quant.append(w) @@ -77,23 +85,23 @@ def get_where(kwargs): _where_cat.append(w) _where_quant.append(w) else: - raise ValueError("wat") + raise ValueError('wat') - where_cat = " AND ".join(_where_cat) - where_quant = " AND ".join(_where_quant) - log.log(9, f"\nwhere-quant\n{where_quant}\nwhere-quant") + where_cat = ' AND '.join(_where_cat) + where_quant = ' AND '.join(_where_quant) + log.log(9, f'\nwhere-quant\n{where_quant}\nwhere-quant') return where_cat, where_quant, params def main_query(endpoint, kwargs): ep_select = { #'instances': 'im.dataset, im.id_formal, im.id_sam, im.id_sub, id.label', - "values/inst": ( - "im.dataset, " - "im.id_formal AS inst, " - "im.id_sam AS sample, " - "im.id_sub AS subject, " - "id.label AS desc_inst" + 'values/inst': ( + 'im.dataset, ' + 'im.id_formal AS inst, ' + 'im.id_sam AS sample, ' + 'im.id_sub AS subject, ' + 'id.label AS desc_inst' ), 'objects': ( # TODO probably some path metadata file type, etc. 
too
             'im.dataset, '
@@ -102,64 +110,64 @@ def main_query(endpoint, kwargs):
             'o.id_file, '  # beware that there might be more than one id_file if a package is multi-file, but we usually ban those
             'oi.updated_transitive'
         ),
-        "values/cat": (
-            "im.dataset, "
-            "im.id_formal AS inst, "
-            "id.label AS desc_inst, "
-            "cdid.label AS domain, "
-            "cd.range, "
-            "cd.label AS desc_cat, "
-            "cv.value_open, "
-            "ct.label AS value_controlled"  # TODO and where did it come from TODO iri
+        'values/cat': (
+            'im.dataset, '
+            'im.id_formal AS inst, '
+            'id.label AS desc_inst, '
+            'cdid.label AS domain, '
+            'cd.range, '
+            'cd.label AS desc_cat, '
+            'cv.value_open, '
+            'ct.label AS value_controlled'  # TODO and where did it come from TODO iri
         ),
         # TODO will want/need to return the shape of the value for these as well since that will be needed to correctly interpret the contents of the value field in the future
-        "values/quant": (
-            "im.dataset, "
-            "im.id_formal AS inst, "
-            "id.label AS desc_inst, "
-            "qd.aggregation_type AS agg_type, "
-            "a.label AS aspect, "
-            "u.label AS unit, qv.value"  # TODO and where did it come from
+        'values/quant': (
+            'im.dataset, '
+            'im.id_formal AS inst, '
+            'id.label AS desc_inst, '
+            'qd.aggregation_type AS agg_type, '
+            'a.label AS aspect, '
+            'u.label AS unit, qv.value'  # TODO and where did it come from
         ),
-        "values/cat-quant": (
+        'values/cat-quant': (
             (
                 "'value-cat' AS type, "
-                "im.dataset, "
-                "im.id_formal AS inst, "
-                "id.label AS desc_inst, "
-                "cdid.label AS domain, "
-                "cd.range, "
-                "NULL::quant_agg_type as agg_type, "  # have to annoate the nulls because distinct causes type inference to fail ???
-                "cd.label AS pred_or_asp, "
-                "cv.value_open AS vo_or_unit, "
-                "ct.label AS value_controlled, "
-                "NULL::numeric AS value"
+                'im.dataset, '
+                'im.id_formal AS inst, '
+                'id.label AS desc_inst, '
+                'cdid.label AS domain, '
+                'cd.range, '
+                'NULL::quant_agg_type as agg_type, '  # have to annotate the nulls because distinct causes type inference to fail ??? 
+ 'cd.label AS pred_or_asp, ' + 'cv.value_open AS vo_or_unit, ' + 'ct.label AS value_controlled, ' + 'NULL::numeric AS value' ), ( "'value-quant' AS type, im.dataset, " - "im.id_formal AS inst, id.label AS desc_inst, " - "NULL AS domain, " - "NULL::cat_range_type AS range, " - "qd.aggregation_type AS agg_type, " - "a.label AS aspect, " - "u.label AS unit, " - "NULL AS vc, qv.value" + 'im.id_formal AS inst, id.label AS desc_inst, ' + 'NULL AS domain, ' + 'NULL::cat_range_type AS range, ' + 'qd.aggregation_type AS agg_type, ' + 'a.label AS aspect, ' + 'u.label AS unit, ' + 'NULL AS vc, qv.value' ), ), - "desc/inst": ("id.iri, " "id.label, " "idpar.label as subclassof "), - "desc/cat": ("cd.label, " "cdid.label AS domain, " "cd.range, " "cd.description "), - "desc/quant": ( - "qd.label, " - "id.label AS domain, " - "qd.shape, " - "qd.aggregation_type as agg_type, " - "a.label AS aspect, " - "u.label AS unit, " - "qd.description " + 'desc/inst': ('id.iri, ' 'id.label, ' 'idpar.label as subclassof '), + 'desc/cat': ('cd.label, ' 'cdid.label AS domain, ' 'cd.range, ' 'cd.description '), + 'desc/quant': ( + 'qd.label, ' + 'id.label AS domain, ' + 'qd.shape, ' + 'qd.aggregation_type as agg_type, ' + 'a.label AS aspect, ' + 'u.label AS unit, ' + 'qd.description ' ), - "terms": ("ct.iri, " "ct.label "), - "units": ("u.iri, " "u.label "), - "aspects": ("a.iri, " "a.label, " "aspar.label as subclassof "), + 'terms': ('ct.iri, ' 'ct.label '), + 'units': ('u.iri, ' 'u.label '), + 'aspects': ('a.iri, ' 'a.label, ' 'aspar.label as subclassof '), }[endpoint] # FIXME move extra and select out and pass then in in as arguments ? or retain control here? @@ -167,21 +175,21 @@ def gkw(k): return k in kwargs and kwargs[k] class sn: # select needs - objects = endpoint == "objects" + objects = endpoint == 'objects' desc_inst = endpoint not in ( - "objects", - "terms", - "units", - "aspects", + 'objects', + 'terms', + 'units', + 'aspects', ) - desc_cat = endpoint in ("values/cat", "values/cat-quant", "desc/cat") - value_cat = endpoint in ("values/cat", "values/cat-quant", "terms") - aspect = endpoint in ("values/quant", "values/cat-quant", "desc/quant", "aspects") - unit = endpoint in ("values/quant", "values/cat-quant", "desc/quant", "units") - agg_type = endpoint in ("values/quant", "values/cat-quant") + desc_cat = endpoint in ('values/cat', 'values/cat-quant', 'desc/cat') + value_cat = endpoint in ('values/cat', 'values/cat-quant', 'terms') + aspect = endpoint in ('values/quant', 'values/cat-quant', 'desc/quant', 'aspects') + unit = endpoint in ('values/quant', 'values/cat-quant', 'desc/quant', 'units') + agg_type = endpoint in ('values/quant', 'values/cat-quant') desc_quant = aspect or unit or agg_type - parent_aspect = endpoint == "aspects" - parent_desc_inst = endpoint == "desc/inst" + parent_aspect = endpoint == 'aspects' + parent_desc_inst = endpoint == 'desc/inst' class kw: # keywords prov = gkw('prov') @@ -193,7 +201,7 @@ class kw: # keywords aspect = gkw('aspect') unit = gkw('unit') agg_type = gkw('agg-type') - desc_quant = (aspect or unit or agg_type) + desc_quant = aspect or unit or agg_type q_par_desc_inst = """ JOIN descriptors_inst AS idstart ON idstart.id = {join_to}.desc_inst @@ -243,8 +251,8 @@ class kw: # keywords NULL as prov_aspect_addr_field, NULL::field_value_type as prov_aspect_type """ - if sn.unit or endpoint == "values/inst" - else "" + if sn.unit or endpoint == 'values/inst' + else '' ) s_prov_q = """ @@ -280,157 +288,205 @@ class kw: # keywords """ maybe_distinct = ( - "DISTINCT " + 
'DISTINCT ' if ( - endpoint.startswith("desc/") - or endpoint in ("terms", "units", "aspects") + endpoint.startswith('desc/') + or endpoint in ('terms', 'units', 'aspects') or (sn.objects or kw.prov) and not kw.source_only ) - else "" + else '' ) ep_select_cat, ep_select_quant = ep_select if isinstance(ep_select, tuple) else (ep_select, ep_select) - select_cat = f"SELECT {maybe_distinct}{ep_select_cat}" + ( - (s_prov_objs + s_prov_i + ((",\n" + s_prov_c) if endpoint != "values/inst" else "")) if kw.prov else "" + select_cat = f'SELECT {maybe_distinct}{ep_select_cat}' + ( + (s_prov_objs + s_prov_i + ((',\n' + s_prov_c) if endpoint != 'values/inst' else '')) if kw.prov else '' ) - select_quant = f"SELECT {maybe_distinct}{ep_select_quant}" + ( - (s_prov_objs + s_prov_i + ((",\n" + s_prov_q) if endpoint != "values/inst" else "")) if kw.prov else "" + select_quant = f'SELECT {maybe_distinct}{ep_select_quant}' + ( + (s_prov_objs + s_prov_i + ((',\n' + s_prov_q) if endpoint != 'values/inst' else '')) if kw.prov else '' ) _where_cat, _where_quant, params = get_where(kwargs) - where_cat = f"WHERE {_where_cat}" if _where_cat else "" - where_quant = f"WHERE {_where_quant}" if _where_quant else "" + where_cat = f'WHERE {_where_cat}' if _where_cat else '' + where_quant = f'WHERE {_where_quant}' if _where_quant else '' - q_inst_parent = '\n'.join(( - 'JOIN values_inst AS icin', - 'CROSS JOIN LATERAL get_child_closed_inst(icin.id) AS ic ON im.id = ic.child', - )) if kw.parent_inst else '' + q_inst_parent = ( + '\n'.join( + ( + 'JOIN values_inst AS icin', + 'CROSS JOIN LATERAL get_child_closed_inst(icin.id) AS ic ON im.id = ic.child', + ) + ) + if kw.parent_inst + else '' + ) # FIXME even trying to be smart here about which joins to pull just papers over the underlying perf issue # shaves about 140ms off but the underlying issue remains - q_cat = '\n'.join(( - 'FROM values_cat AS cv', - '\n'.join(( - 'JOIN descriptors_inst AS idin', - 'CROSS JOIN LATERAL get_child_closed_desc_inst(idin.id) AS idc ON cv.desc_inst = idc.child -- FIXME', - )) if kw.desc_inst else '', - (q_par_desc_inst.format(join_to='cv') if sn.parent_desc_inst else - 'JOIN descriptors_inst AS id ON cv.desc_inst = id.id' - ) if sn.desc_inst or kw.desc_inst else '', # FIXME handle parents case - 'JOIN values_inst AS im ON cv.instance = im.id', - q_inst_parent, - '\n'.join(( - 'JOIN descriptors_cat AS cd ON cv.desc_cat = cd.id', - 'LEFT OUTER JOIN descriptors_inst AS cdid ON cd.domain = cdid.id -- XXX TODO mismach', - )) if sn.desc_cat or kw.desc_cat else '', - 'LEFT OUTER JOIN controlled_terms AS ct ON cv.value_controlled = ct.id' if sn.value_cat or kw.value_cat else '', - (('\n' - 'JOIN objects AS o ON cv.object = o.id\n' - 'LEFT OUTER JOIN objects_internal AS oi\n' - 'ON oi.id = o.id\n') - if kw.source_only else - ('\n' # have to use LEFT OUTER because object might have only one of cat or quant - 'LEFT OUTER JOIN values_quant AS qv ON qv.instance = im.id\n' - 'JOIN objects AS o ON cv.object = o.id OR qv.object = o.id\n' - 'LEFT OUTER JOIN objects_internal AS oi\n' - 'ON oi.id = o.id\n') - ) if sn.objects or kw.prov else '', - (q_prov_i + q_prov_c) if kw.prov else '', - )) - - q_quant = '\n'.join(( - 'FROM values_quant AS qv', - '\n'.join(( - 'JOIN descriptors_inst AS idin', - 'CROSS JOIN LATERAL get_child_closed_desc_inst(idin.id) AS idc ON qv.desc_inst = idc.child -- FIXME', - )) if kw.desc_inst else '', - (q_par_desc_inst.format(join_to='qv') if sn.parent_desc_inst else - 'JOIN descriptors_inst AS id ON qv.desc_inst = id.id' - ) if 
sn.desc_inst or kw.desc_inst else '',  # FIXME handle parents case
-        'JOIN values_inst AS im ON qv.instance = im.id',
-        q_inst_parent,
-        'JOIN descriptors_quant AS qd ON qv.desc_quant = qd.id' if (
-            sn.desc_quant or kw.desc_quant) else '',
-        '\n'.join((
-            'JOIN aspects AS ain',
-            'CROSS JOIN LATERAL get_child_closed_aspect(ain.id) AS ac ON qd.aspect = ac.child',
-            'JOIN aspects AS a ON ac.child = a.id',
-        )) if kw.aspect else (
-            (q_par_aspect if sn.parent_aspect else
-             'JOIN aspects AS a ON qd.aspect = a.id'
-             ) if sn.aspect else ''),  # FIXME handle parents case
-        'LEFT OUTER JOIN units AS u ON qd.unit = u.id' if sn.unit or kw.unit else '',
-        (('\n'
-          'JOIN objects AS o ON qv.object = o.id\n'
-          'LEFT OUTER JOIN objects_internal AS oi ON oi.id = o.id\n')
-         if kw.source_only else
-         ('\n'  # have to use LEFT OUTER because object might have only one of cat or quant
-          'LEFT OUTER JOIN values_cat AS cv ON cv.instance = im.id\n'
-          'JOIN objects AS o ON qv.object = o.id OR cv.object = o.id\n'
-          'LEFT OUTER JOIN objects_internal AS oi ON oi.id = o.id\n')
-         ) if sn.objects or kw.prov else '',
-        (q_prov_i + q_prov_q) if kw.prov else '',
-    ))
-
-    sw_cat = f"{select_cat}\n{q_cat}\n{where_cat}"  # XXX yes this can be malformed in some cases
-    sw_quant = f"{select_quant}\n{q_quant}\n{where_quant}"  # XXX yes this can be malformed in some cases
-    if endpoint in ("values/cat", "terms", "desc/cat"):
+    q_cat = '\n'.join(
+        (
+            'FROM values_cat AS cv',
+            '\n'.join(
+                (
+                    'JOIN descriptors_inst AS idin',
+                    'CROSS JOIN LATERAL get_child_closed_desc_inst(idin.id) AS idc ON cv.desc_inst = idc.child -- FIXME',
+                )
+            )
+            if kw.desc_inst
+            else '',
+            (
+                q_par_desc_inst.format(join_to='cv')
+                if sn.parent_desc_inst
+                else 'JOIN descriptors_inst AS id ON cv.desc_inst = id.id'
+            )
+            if sn.desc_inst or kw.desc_inst
+            else '',  # FIXME handle parents case
+            'JOIN values_inst AS im ON cv.instance = im.id',
+            q_inst_parent,
+            '\n'.join(
+                (
+                    'JOIN descriptors_cat AS cd ON cv.desc_cat = cd.id',
+                    'LEFT OUTER JOIN descriptors_inst AS cdid ON cd.domain = cdid.id -- XXX TODO mismatch',
+                )
+            )
+            if sn.desc_cat or kw.desc_cat
+            else '',
+            'LEFT OUTER JOIN controlled_terms AS ct ON cv.value_controlled = ct.id'
+            if sn.value_cat or kw.value_cat
+            else '',
+            (
+                (
+                    '\n'
+                    'JOIN objects AS o ON cv.object = o.id\n'
+                    'LEFT OUTER JOIN objects_internal AS oi\n'
+                    'ON oi.id = o.id\n'
+                )
+                if kw.source_only
+                else (
+                    '\n'  # have to use LEFT OUTER because object might have only one of cat or quant
+                    'LEFT OUTER JOIN values_quant AS qv ON qv.instance = im.id\n'
+                    'JOIN objects AS o ON cv.object = o.id OR qv.object = o.id\n'
+                    'LEFT OUTER JOIN objects_internal AS oi\n'
+                    'ON oi.id = o.id\n'
+                )
+            )
+            if sn.objects or kw.prov
+            else '',
+            (q_prov_i + q_prov_c) if kw.prov else '',
+        )
+    )
+
+    q_quant = '\n'.join(
+        (
+            'FROM values_quant AS qv',
+            '\n'.join(
+                (
+                    'JOIN descriptors_inst AS idin',
+                    'CROSS JOIN LATERAL get_child_closed_desc_inst(idin.id) AS idc ON qv.desc_inst = idc.child -- FIXME',
+                )
+            )
+            if kw.desc_inst
+            else '',
+            (
+                q_par_desc_inst.format(join_to='qv')
+                if sn.parent_desc_inst
+                else 'JOIN descriptors_inst AS id ON qv.desc_inst = id.id'
+            )
+            if sn.desc_inst or kw.desc_inst
+            else '',  # FIXME handle parents case
+            'JOIN values_inst AS im ON qv.instance = im.id',
+            q_inst_parent,
+            'JOIN descriptors_quant AS qd ON qv.desc_quant = qd.id' if (sn.desc_quant or kw.desc_quant) else '',
+            '\n'.join(
+                (
+                    'JOIN aspects AS ain',
+                    'CROSS JOIN LATERAL get_child_closed_aspect(ain.id) AS ac ON qd.aspect = ac.child',
+                    'JOIN 
aspects AS a ON ac.child = a.id', + ) + ) + if kw.aspect + else ( + (q_par_aspect if sn.parent_aspect else 'JOIN aspects AS a ON qd.aspect = a.id') if sn.aspect else '' + ), # FIXME handle parents case + 'LEFT OUTER JOIN units AS u ON qd.unit = u.id' if sn.unit or kw.unit else '', + ( + ( + '\n' + 'JOIN objects AS o ON qv.object = o.id\n' + 'LEFT OUTER JOIN objects_internal AS oi ON oi.id = o.id\n' + ) + if kw.source_only + else ( + '\n' # have to use LEFT OUTER because object might have only one of cat or quant + 'LEFT OUTER JOIN values_cat AS cv ON cv.instance = im.id\n' + 'JOIN objects AS o ON qv.object = o.id OR cv.object = o.id\n' + 'LEFT OUTER JOIN objects_internal AS oi ON oi.id = o.id\n' + ) + ) + if sn.objects or kw.prov + else '', + (q_prov_i + q_prov_q) if kw.prov else '', + ) + ) + + sw_cat = f'{select_cat}\n{q_cat}\n{where_cat}' # XXX yes this can be malformed in some cases + sw_quant = f'{select_quant}\n{q_quant}\n{where_quant}' # XXX yes this can be malformed in some cases + if endpoint in ('values/cat', 'terms', 'desc/cat'): query = sw_cat elif endpoint in ( - "values/quant", - "units", - "aspects", - "desc/quant", + 'values/quant', + 'units', + 'aspects', + 'desc/quant', ): # FIXME TODO make it possible to cross query terms, units, aspects query = sw_quant else: - operator = "UNION" if "union-cat-quant" in kwargs and kwargs["union-cat-quant"] else "INTERSECT" - query = f"{sw_cat}\n{operator}\n{sw_quant}" + operator = 'UNION' if 'union-cat-quant' in kwargs and kwargs['union-cat-quant'] else 'INTERSECT' + query = f'{sw_cat}\n{operator}\n{sw_quant}' - log.log(9, "\n" + query) + log.log(9, '\n' + query) return query, params def to_json(record_type, res, prov=False): rows = list(res) if rows: - if record_type == "object": + if record_type == 'object': result = [ ( - {k: v for k, v in r._asdict().items() if k != "id"} + {k: v for k, v in r._asdict().items() if k != 'id'} # do not leak internal ids because the might change and are not meaningful - if r.id_type == "quantdb" - else {k: v for k, v in r._asdict().items() if k != "updated_transitive"} + if r.id_type == 'quantdb' + else {k: v for k, v in r._asdict().items() if k != 'updated_transitive'} ) for r in rows ] - elif record_type is None and "type" in rows[0]._fields: - rem_cat = "value", "agg_type" + elif record_type is None and 'type' in rows[0]._fields: + rem_cat = 'value', 'agg_type' def type_fields_cat(k): - if k == "pred_or_asp": - return "desc_cat" - elif k == "vo_or_unit": - return "value_open" + if k == 'pred_or_asp': + return 'desc_cat' + elif k == 'vo_or_unit': + return 'value_open' else: return k - rem_quant = "domain", "range", "value_controlled" + rem_quant = 'domain', 'range', 'value_controlled' def type_fields_quant(k): - if k == "pred_or_asp": - return "aspect" - elif k == "vo_or_unit": - return "unit" + if k == 'pred_or_asp': + return 'aspect' + elif k == 'vo_or_unit': + return 'unit' else: return k def prow(r): - if r.type == "value-cat": + if r.type == 'value-cat': rem, type_fields = rem_cat, type_fields_cat - elif r.type == "value-quant": + elif r.type == 'value-quant': rem, type_fields = rem_quant, type_fields_quant else: - raise NotImplementedError(f"wat {r.type}") + raise NotImplementedError(f'wat {r.type}') return {type_fields(k): v for k, v in r._asdict().items() if k not in rem} @@ -440,37 +496,37 @@ def prow(r): for r in result: if record_type is not None: - r["type"] = record_type + r['type'] = record_type - for cull_none in ("subclassof",): + for cull_none in ('subclassof',): if cull_none in r 
and r[cull_none] is None: r.pop(cull_none) if prov: def pop_prefix(d, prefix): - usc = prefix.count("_") + usc = prefix.count('_') return { - k.split("_", 1 + usc)[-1]: v + k.split('_', 1 + usc)[-1]: v for k in list(d) - if k.startswith(prefix + "_") and (v := d.pop(k)) is not None + if k.startswith(prefix + '_') and (v := d.pop(k)) is not None } for r in result: - provs = pop_prefix(r, "prov") - if "source_id_type" in provs and provs["source_id_type"] == "quantdb": - provs.pop("source_id", None) # don't leak internal ids + provs = pop_prefix(r, 'prov') + if 'source_id_type' in provs and provs['source_id_type'] == 'quantdb': + provs.pop('source_id', None) # don't leak internal ids else: - provs.pop("source_updated_transitive", None) # always None in this case + provs.pop('source_updated_transitive', None) # always None in this case - for prefix in ("desc_inst", "inst", "value", "value", "source"): + for prefix in ('desc_inst', 'inst', 'value', 'value', 'source'): d = pop_prefix(provs, prefix) if d: - d["type"] = "address" if prefix != "source" else "object" + d['type'] = 'address' if prefix != 'source' else 'object' provs[prefix] = d - provs["type"] = "prov" - r["prov"] = provs + provs['type'] = 'prov' + r['prov'] = provs out = result # breakpoint() @@ -488,20 +544,20 @@ def wrap_out(endpoint, kwargs, out): parameters = {k: v for k, v in kwargs.items() if v} n_records = len(out) blob = { - "type": "quantdb-query-result", - "endpoint": endpoint, - "parameters": parameters, - "records": n_records, - "result": out, + 'type': 'quantdb-query-result', + 'endpoint': endpoint, + 'parameters': parameters, + 'records': n_records, + 'result': out, } return blob args_default = { - "object": [], - "updated-transitive": None, # TODO needed to query for some internal + 'object': [], + 'updated-transitive': None, # TODO needed to query for some internal ## inst - "desc-inst": [], # aka class + 'desc-inst': [], # aka class # value-inst 'dataset': None, 'inst': [], @@ -509,27 +565,26 @@ def wrap_out(endpoint, kwargs, out): 'subject': [], 'sample': [], 'include-equivalent': False, - ## cat - "desc-cat": [], # aka predicate - "value-cat": [], - "value-cat-open": [], + 'desc-cat': [], # aka predicate + 'value-cat': [], + 'value-cat-open': [], ## quant # desc-quant - "unit": [], - "aspect": [], - "agg-type": None, + 'unit': [], + 'aspect': [], + 'agg-type': None, # TODO shape - "value-quant": None, - "value-quant-margin": None, - "value-quant-min": None, - "value-quant-max": None, - "limit": 100, + 'value-quant': None, + 'value-quant-margin': None, + 'value-quant-min': None, + 'value-quant-max': None, + 'limit': 100, #'operator': 'INTERSECT', # XXX ... - "union-cat-quant": False, # by default we intersect but sometimes you want the union instead e.g. if object is passed - "source-only": False, - "include-unused": False, - "prov": False, + 'union-cat-quant': False, # by default we intersect but sometimes you want the union instead e.g. 
if object is passed + 'source-only': False, + 'include-unused': False, + 'prov': False, #'cat-value': [], #'class': [], #'predicate': None, @@ -546,33 +601,33 @@ def getArgs(request, endpoint, dev=False): default = copy.deepcopy(args_default) if dev: - default["return-query"] = False + default['return-query'] = False # modify defaults by endpoint - if endpoint != "objects": - default.pop("source-only") + if endpoint != 'objects': + default.pop('source-only') - if not (endpoint.startswith("desc/") or endpoint in ("terms", "units", "aspects")): - default.pop("include-unused") + if not (endpoint.startswith('desc/') or endpoint in ('terms', 'units', 'aspects')): + default.pop('include-unused') else: # prevent filtering on the thing we are trying to query - if endpoint == "terms": - default.pop("value-cat") - elif endpoint == "units": - default.pop("unit") - elif endpoint == "aspects": - default.pop("aspect") - elif endpoint == "desc/inst": - default.pop("desc-inst") - elif endpoint == "desc/cat": - default.pop("desc-cat") - - if not endpoint.startswith("values/"): - default.pop("prov") - elif endpoint == "values/cat": - [default.pop(k) for k in list(default) if k.startswith("value-quant") or k in ("unit", "aspect", "agg-type")] - elif endpoint == "values/quant": - [default.pop(k) for k in list(default) if k in ("desc-cat", "value-cat", "value-cat-open")] + if endpoint == 'terms': + default.pop('value-cat') + elif endpoint == 'units': + default.pop('unit') + elif endpoint == 'aspects': + default.pop('aspect') + elif endpoint == 'desc/inst': + default.pop('desc-inst') + elif endpoint == 'desc/cat': + default.pop('desc-cat') + + if not endpoint.startswith('values/'): + default.pop('prov') + elif endpoint == 'values/cat': + [default.pop(k) for k in list(default) if k.startswith('value-quant') or k in ('unit', 'aspect', 'agg-type')] + elif endpoint == 'values/quant': + [default.pop(k) for k in list(default) if k in ('desc-cat', 'value-cat', 'value-cat-open')] if (endpoint == 'values/inst') or (endpoint == 'objects'): # prevent getting no results if only cat or quant @@ -584,13 +639,15 @@ def getArgs(request, endpoint, dev=False): extras = set(request.args) - set(default) if extras: # FIXME raise this as a 401, TODO need error types for this - nl = "\n" - raise exc.UnknownArg(f"unknown args: {nl.join(extras)}") + nl = '\n' + raise exc.UnknownArg(f'unknown args: {nl.join(extras)}') def convert(k, d): if k in request.args: # arity is determined here - if k in ('dataset', 'include-equivalent', 'union-cat-quant', 'include-unused', 'agg-type') or k.startswith('value-quant'): + if k in ('dataset', 'include-equivalent', 'union-cat-quant', 'include-unused', 'agg-type') or k.startswith( + 'value-quant' + ): v = request.args[k] if k in ('dataset',): if not v: @@ -620,14 +677,14 @@ def convert(k, d): else: return d - if k in ("include-equivalent", "union-cat-quant", "include-unused"): - if v.lower() == "true": + if k in ('include-equivalent', 'union-cat-quant', 'include-unused'): + if v.lower() == 'true': return True - elif v.lower() == "false": + elif v.lower() == 'false': return False else: raise TypeError(f'Expected a bool, got "{v}" instead.') - elif k.startswith("value-quant") or k in ("limit",): + elif k.startswith('value-quant') or k in ('limit',): try: return float(v) except ValueError as e: @@ -639,16 +696,16 @@ def convert(k, d): return out -def make_app(db=None, name="quantdb-api-server", dev=False): +def make_app(db=None, name='quantdb-api-server', dev=False): app = Flask(name) - kwargs = {k: 
auth.get(f"db-{k}") for k in ("user", "host", "port", "database")} # TODO integrate with cli options - kwargs["dbuser"] = kwargs.pop("user") - app.config["SQLALCHEMY_DATABASE_URI"] = dbUri(**kwargs) # use os.environ.update - app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False + kwargs = {k: auth.get(f'db-{k}') for k in ('user', 'host', 'port', 'database')} # TODO integrate with cli options + kwargs['dbuser'] = kwargs.pop('user') + app.config['SQLALCHEMY_DATABASE_URI'] = dbUri(**kwargs) # use os.environ.update + app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False db.init_app(app) session = db.session - bp = "/api/1/" + bp = '/api/1/' def default_flow(endpoint, record_type, query_fun, json_fun, alt_query_fun=None): try: @@ -662,7 +719,7 @@ def default_flow(endpoint, record_type, query_fun, json_fun, alt_query_fun=None) def gkw(k): return k in kwargs and kwargs[k] - if gkw("include-unused"): + if gkw('include-unused'): query_fun = alt_query_fun # FIXME record_type is actually determined entirely in query_fun right now @@ -673,19 +730,24 @@ def gkw(k): raise e if gkw('return-query'): - #from psycopg2cffi._impl.cursor import _combine_cmd_params # this was an absolute pita to track down - #stq = sql_text(query) - #stq = stq.bindparams(**params) - #conn = session.connection() - #cur = conn.engine.raw_connection().cursor() - #cq, cp, _ = stq._compile_w_cache(dialect=conn.dialect, compiled_cache=conn.engine._compiled_cache, column_keys=sorted(params)) - #almost = str(stq.compile(dialect=conn.dialect,)) #compile_kwargs={'literal_binds': True}, - #wat = _combine_cmd_params(str(cq), params, cur.connection) + # from psycopg2cffi._impl.cursor import _combine_cmd_params # this was an absolute pita to track down + # stq = sql_text(query) + # stq = stq.bindparams(**params) + # conn = session.connection() + # cur = conn.engine.raw_connection().cursor() + # cq, cp, _ = stq._compile_w_cache(dialect=conn.dialect, compiled_cache=conn.engine._compiled_cache, column_keys=sorted(params)) + # almost = str(stq.compile(dialect=conn.dialect,)) #compile_kwargs={'literal_binds': True}, + # wat = _combine_cmd_params(str(cq), params, cur.connection) ord_params = {k: v for k, v in sorted(params.items())} ARRAY = 'ARRAY' ccuuid = '::uuid' - org_vars = ' '.join([f':var {key}="{ARRAY + repr(value) if isinstance(value, list) else (repr(str(value)) + ccuuid if isinstance(value, uuid.UUID) else repr(value))}"' for key, value in ord_params.items()]) - return f''' SQL query expansion for quantdb @@ -699,7 +761,7 @@ def gkw(k): {query} -''' +""" try: res = session.execute(sql_text(query), params) @@ -708,128 +770,126 @@ def gkw(k): raise e try: - out = json_fun(record_type, res, prov=("prov" in kwargs and kwargs["prov"])) - resp = json.dumps(wrap_out(endpoint, kwargs, out), cls=JEncode), 200, {"Content-Type": "application/json"} + out = json_fun(record_type, res, prov=('prov' in kwargs and kwargs['prov'])) + resp = json.dumps(wrap_out(endpoint, kwargs, out), cls=JEncode), 200, {'Content-Type': 'application/json'} except Exception as e: breakpoint() raise e return resp - @app.route(f"{bp}/test") + @app.route(f'{bp}/test') def route_test_check(): - "objects with derived values that match all criteria" - return "testing-api" + """objects with derived values that match all criteria""" + return 'testing-api' - @app.route(f"{bp}/objects") + @app.route(f'{bp}/objects') def route_1_objects(): - "objects with derived values that match all criteria" - return default_flow("objects", "object", main_query, to_json) + """objects with derived 
values that match all criteria"""
+        return default_flow('objects', 'object', main_query, to_json)
 
-    @app.route(f"{bp}/desc/inst")
-    @app.route(f"{bp}/descriptors/inst")
-    @app.route(f"{bp}/classes")
+    @app.route(f'{bp}/desc/inst')
+    @app.route(f'{bp}/descriptors/inst')
+    @app.route(f'{bp}/classes')
     def route_1_desc_inst():
         def query(endpoint, kwargs):
             return (
-                (
-                    "SELECT "
-                    "id.iri, "
-                    "id.label, "
-                    "idpar.label as subclassof"
-                    """
-FROM descriptors_inst AS id
-LEFT OUTER JOIN class_parent AS clp ON clp.id = id.id
-LEFT OUTER JOIN descriptors_inst AS idpar ON idpar.id = clp.parent
-"""), {}
+                'SELECT '
+                'id.iri, '
+                'id.label, '
+                'idpar.label as subclassof '
+                'FROM descriptors_inst AS id '
+                'LEFT OUTER JOIN class_parent AS clp ON clp.id = id.id '
+                'LEFT OUTER JOIN descriptors_inst AS idpar ON idpar.id = clp.parent'
+            ), {}
 
-        return default_flow("desc/inst", "desc-inst", main_query, to_json, alt_query_fun=query)
+        return default_flow('desc/inst', 'desc-inst', main_query, to_json, alt_query_fun=query)
 
-    @app.route(f"{bp}/desc/cat")
-    @app.route(f"{bp}/descriptors/cat")
-    @app.route(f"{bp}/predicates")
+    @app.route(f'{bp}/desc/cat')
+    @app.route(f'{bp}/descriptors/cat')
+    @app.route(f'{bp}/predicates')
     def route_1_desc_cat():
         def query(endpoint, kwargs):
             return (
-                "select "
-                "cd.label, "
-                "cdid.label AS domain, "
-                "cd.range, "
-                "cd.description "
-                "from descriptors_cat as cd "
-                "left outer join descriptors_inst as cdid on cdid.id = cd.domain"
+                'select '
+                'cd.label, '
+                'cdid.label AS domain, '
+                'cd.range, '
+                'cd.description '
+                'from descriptors_cat as cd '
+                'left outer join descriptors_inst as cdid on cdid.id = cd.domain'
             ), {}
 
         return default_flow(
-            "desc/cat", "desc-cat", main_query, to_json, alt_query_fun=query
+            'desc/cat', 'desc-cat', main_query, to_json, alt_query_fun=query
         )  # TODO likely need different args e.g. to filter by desc_inst
 
-    @app.route(f"{bp}/desc/quant")
-    @app.route(f"{bp}/descriptors/quant")
+    @app.route(f'{bp}/desc/quant')
+    @app.route(f'{bp}/descriptors/quant')
    def route_1_desc_quant():
         def query(endpoint, kwargs):
             return (
-                "select "
-                "qd.label, "
-                "id.label AS domain, "
-                "qd.shape, "
-                "qd.aggregation_type as agg_type, "
-                "a.label AS aspect, "
-                "u.label AS unit, "
-                "qd.description "
-                "from descriptors_quant as qd "
-                "left outer join descriptors_inst as id on id.id = qd.domain "
-                "left outer join units as u on u.id = qd.unit "
-                "join aspects as a on a.id = qd.aspect"
+                'select '
+                'qd.label, '
+                'id.label AS domain, '
+                'qd.shape, '
+                'qd.aggregation_type as agg_type, '
+                'a.label AS aspect, '
+                'u.label AS unit, '
+                'qd.description '
+                'from descriptors_quant as qd '
+                'left outer join descriptors_inst as id on id.id = qd.domain '
+                'left outer join units as u on u.id = qd.unit '
+                'join aspects as a on a.id = qd.aspect'
             ), {}
 
         return default_flow(
-            "desc/quant", "desc-quant", main_query, to_json, alt_query_fun=query
+            'desc/quant', 'desc-quant', main_query, to_json, alt_query_fun=query
         )  # TODO likely need different args e.g. 
to filter by desc_inst
 
-    @app.route(f"{bp}/values/inst")
-    @app.route(f"{bp}/instances")
+    @app.route(f'{bp}/values/inst')
+    @app.route(f'{bp}/instances')
     def route_1_val_inst():
-        "instances associated with values that match all critiera"
-        return default_flow("values/inst", "instance", main_query, to_json)
+        """instances associated with values that match all criteria"""
+        return default_flow('values/inst', 'instance', main_query, to_json)
 
-    @app.route(f"{bp}/values")
-    @app.route(f"{bp}/values/cat-quant")
+    @app.route(f'{bp}/values')
+    @app.route(f'{bp}/values/cat-quant')
     def route_1_val_cat_quant():
-        return default_flow("values/cat-quant", None, main_query, to_json)
+        return default_flow('values/cat-quant', None, main_query, to_json)
 
-    @app.route(f"{bp}/values/cat")
+    @app.route(f'{bp}/values/cat')
     def route_1_val_cat():
-        return default_flow("values/cat", "value-cat", main_query, to_json)
+        return default_flow('values/cat', 'value-cat', main_query, to_json)
 
-    @app.route(f"{bp}/values/quant")
+    @app.route(f'{bp}/values/quant')
     def route_1_val_quant():
-        return default_flow("values/quant", "value-quant", main_query, to_json)
+        return default_flow('values/quant', 'value-quant', main_query, to_json)
 
-    @app.route(f"{bp}/terms")
-    @app.route(f"{bp}/controlled-terms")
+    @app.route(f'{bp}/terms')
+    @app.route(f'{bp}/controlled-terms')
     def route_1_cterms():
         def query(endpoint, kwargs):
-            return ("select " "ct.iri, " "ct.label " "from controlled_terms as ct"), {}
+            return ('select ' 'ct.iri, ' 'ct.label ' 'from controlled_terms as ct'), {}
 
-        return default_flow("terms", "term", main_query, to_json, alt_query_fun=query)
+        return default_flow('terms', 'term', main_query, to_json, alt_query_fun=query)
 
-    @app.route(f"{bp}/units")
+    @app.route(f'{bp}/units')
     def route_1_units():
         def query(endpoint, kwargs):
-            return ("select " "u.iri, " "u.label " "from units as u"), {}
+            return ('select ' 'u.iri, ' 'u.label ' 'from units as u'), {}
 
-        return default_flow("units", "unit", main_query, to_json, alt_query_fun=query)
+        return default_flow('units', 'unit', main_query, to_json, alt_query_fun=query)
 
-    @app.route(f"{bp}/aspects")
+    @app.route(f'{bp}/aspects')
     def route_1_aspects():
         def query(endpoint, kwargs):
             return (
                 (
-                    "SELECT "
-                    "a.iri, "
-                    "a.label, "
-                    "aspar.label AS subclassof "
+                    'SELECT '
+                    'a.iri, '
+                    'a.label, '
+                    'aspar.label AS subclassof '
                     """
 FROM aspects AS a
 LEFT OUTER JOIN aspect_parent AS ap ON ap.id = a.id
@@ -839,6 +899,6 @@ def query(endpoint, kwargs):
                 {},
             )
 
-        return default_flow("aspects", "aspect", main_query, to_json, alt_query_fun=query)
+        return default_flow('aspects', 'aspect', main_query, to_json, alt_query_fun=query)
 
     return app
 
diff --git a/quantdb/api_server.py b/quantdb/api_server.py
index 0d5c8c3..0611dad 100644
--- a/quantdb/api_server.py
+++ b/quantdb/api_server.py
@@ -9,5 +9,5 @@
 app = make_app(db=db)
 
-if __name__ == "__main__":
-    app.run(host="localhost", port=8989, threaded=True)
+if __name__ == '__main__':
+    app.run(host='localhost', port=8989, threaded=True)
diff --git a/quantdb/auth-config.py b/quantdb/auth-config.py
index 3b430a3..ebbedcb 100644
--- a/quantdb/auth-config.py
+++ b/quantdb/auth-config.py
@@ -1,27 +1,27 @@
 {
-    "config-search-paths": [
-        "{:user-config-path}/quantdb/config.yaml",
+    'config-search-paths': [
+        '{:user-config-path}/quantdb/config.yaml',
     ],
-    "auth-variables": {
+    'auth-variables': {
         # test-db
-        "test-db-user": {"default": "quantdb-test-user", "environment-variables": "QUANTDB_TEST_DB_USER"},
-        "test-db-host": {"default": "localhost", 
"environment-variables": "QUANTDB_TEST_DB_HOST"}, - "test-db-port": {"default": 5432, "environment-variables": "QUANTDB_TEST_DB_PORT"}, - "test-db-database": { + 'test-db-user': {'default': 'quantdb-test-user', 'environment-variables': 'QUANTDB_TEST_DB_USER'}, + 'test-db-host': {'default': 'localhost', 'environment-variables': 'QUANTDB_TEST_DB_HOST'}, + 'test-db-port': {'default': 5432, 'environment-variables': 'QUANTDB_TEST_DB_PORT'}, + 'test-db-database': { # we DO set a default database for testing # so that it is present for reference - "default": "quantdb_test", - "environment-variables": "QUANTDB_TEST_DB_DATABASE QUANTDB_TEST_DATABASE", + 'default': 'quantdb_test', + 'environment-variables': 'QUANTDB_TEST_DB_DATABASE QUANTDB_TEST_DATABASE', }, # db - "db-user": {"default": "quantdb-user", "environment-variables": "QUANTDB_DB_USER"}, - "db-host": {"default": "localhost", "environment-variables": "QUANTDB_DB_HOST"}, - "db-port": {"default": 5432, "environment-variables": "QUANTDB_DB_PORT"}, - "db-database": { + 'db-user': {'default': 'quantdb-user', 'environment-variables': 'QUANTDB_DB_USER'}, + 'db-host': {'default': 'localhost', 'environment-variables': 'QUANTDB_DB_HOST'}, + 'db-port': {'default': 5432, 'environment-variables': 'QUANTDB_DB_PORT'}, + 'db-database': { # we don't set a default here to prevent # accidental operations on a default db - "default": None, - "environment-variables": "QUANTDB_DB_DATABASE QUANTDB_DATABASE", + 'default': None, + 'environment-variables': 'QUANTDB_DB_DATABASE QUANTDB_DATABASE', }, }, } diff --git a/quantdb/config.py b/quantdb/config.py index fbcd536..a50ec33 100644 --- a/quantdb/config.py +++ b/quantdb/config.py @@ -1,3 +1,3 @@ import orthauth as oa -auth = oa.configure_here("auth-config.py", __name__) +auth = oa.configure_here('auth-config.py', __name__) diff --git a/quantdb/exceptions.py b/quantdb/exceptions.py index 41998a2..a722742 100644 --- a/quantdb/exceptions.py +++ b/quantdb/exceptions.py @@ -3,12 +3,12 @@ class QuantdbError(Exception): class UnknownArg(QuantdbError): - """ url query parameter unknown """ + """url query parameter unknown""" class ArgMissingValue(QuantdbError): - """ url query parameter contained no value """ + """url query parameter contained no value""" class BadValue(QuantdbError): - """ url query parameter contained a malformed value """ + """url query parameter contained a malformed value""" diff --git a/quantdb/ingest.py b/quantdb/ingest.py index 8fba0fe..cb9bd41 100644 --- a/quantdb/ingest.py +++ b/quantdb/ingest.py @@ -5,19 +5,20 @@ import requests from sparcur import objects as sparcur_objects # register pathmeta type -# FIXME sparcur dependencies, or keep ingest separate -from sparcur.utils import fromJson, PennsieveId as RemoteId from sparcur.paths import Path -from sparcur import objects as sparcur_objects # register pathmeta type -from quantdb.utils import log, dbUri, isoformat +from sparcur.utils import PennsieveId as RemoteId +from sparcur.utils import fromJson + +from quantdb.utils import dbUri, isoformat, log +# FIXME sparcur.utils dependencies, or keep ingest separate ######### start database interaction section -log = log.getChild("ingest") +log = log.getChild('ingest') try: - if get_ipython().__class__.__name__ == "ZMQInteractiveShell": + if get_ipython().__class__.__name__ == 'ZMQInteractiveShell': import sys # FIXME hack that should be in jupyter-repl env or something sys.breakpointhook = lambda: None @@ -59,7 +60,7 @@ def __call__(self, value, type=None): return self.value_to_name[value] else: 
self.counter += 1 - name = "v" + str(self.counter) + name = 'v' + str(self.counter) if type is None: self.value_to_name[value] = name @@ -73,8 +74,8 @@ def __call__(self, value, type=None): def makeParamsValues(*value_sets, constants=tuple(), types=tuple(), row_types=tuple()): # TODO variable sized records and # common value names - if constants and not all(":" in c for c in constants): - raise ValueError(f"All constants must pass variables in via params {constants}") + if constants and not all(':' in c for c in constants): + raise ValueError(f'All constants must pass variables in via params {constants}') getname = getName() @@ -94,8 +95,8 @@ def makeParamsValues(*value_sets, constants=tuple(), types=tuple(), row_types=tu else: proto_params = [(tuple(getname(value) for value in row), row) for row in values] - values_template = ", ".join( - "(" + ", ".join(constants + tuple(":" + name for name in names)) + ")" for names, _ in proto_params + values_template = ', '.join( + '(' + ', '.join(constants + tuple(':' + name for name in names)) + ')' for names, _ in proto_params ) yield values_template if row_types: @@ -138,9 +139,9 @@ def makeParamsValues(*value_sets, constants=tuple(), types=tuple(), row_types=tu 'p': 2, # posterior } seg_ordering = { - "c": 0, # cervical - "t": 1, # thoracic - "a": 2, # abdominal + 'c': 0, # cervical + 't': 1, # thoracic + 'a': 2, # abdominal } @@ -148,7 +149,7 @@ def anat_index(sample): # count the number of distinct values less than a given integer # create the map - sam, sam_id, seg, seg_id = sample.split("-") + sam, sam_id, seg, seg_id = sample.split('-') # FIXME bad becase left and right are unstable and we don't care about this, we just want relative to max possible # don't do this with sort sam_ind = sam_ordering[sam_id] @@ -158,14 +159,14 @@ def anat_index(sample): seg_ind = v break else: - if sam_id == "c": + if sam_id == 'c': # print('c sample', sample) # rest = int(''.join(_ for _ in seg_id if _.isdigit())) rest = int(seg_id[:-1]) suffix = int(seg_id[-1].encode().hex()) return sam_ind, 0, rest, suffix else: - msg = f"unknown seg {sample}" + msg = f'unknown seg {sample}' print(msg) # FIXME TODO logging # raise ValueError(msg) # return int(f'{sam_ind}000') @@ -217,14 +218,16 @@ def pps123(path_structure): 'parents': (p1,), 'subject': subject, 'sample': sample, - } + } def ext_pmeta(j, _pps=pps): out = {} out['dataset'] = j['dataset_id'] out['object'] = j['remote_id'] - out['file_id'] = j['file_id'] if 'file_id' in j else int(j['uri_api'].rsplit('/')[-1]) # XXX old pathmeta schema that didn't include file id + out['file_id'] = ( + j['file_id'] if 'file_id' in j else int(j['uri_api'].rsplit('/')[-1]) + ) # XXX old pathmeta schema that didn't include file id ps = pathlib.Path(j['dataset_relative_path']).parts [p for p in ps if p.startswith('sub-') or p.startswith('sam-')] out.update(_pps(ps)) @@ -242,7 +245,12 @@ def __init__(self, session): def address_from_fadd_type_fadd(self, fadd_type, fadd): # FIXME multi etc. params = dict(fadd_type=fadd_type, fadd=fadd) - res = [i for i, in self.session.execute(sql_text("select * from address_from_fadd_type_fadd(:fadd_type, :fadd)"), params)] + res = [ + i + for i, in self.session.execute( + sql_text('select * from address_from_fadd_type_fadd(:fadd_type, :fadd)'), params + ) + ] if res: out = res[0] if out is None: @@ -253,7 +261,7 @@ def address_from_fadd_type_fadd(self, fadd_type, fadd): def desc_inst_from_label(self, label): # FIXME multi etc. 
params = dict(label=label) - res = [i for i, in self.session.execute(sql_text("select * from desc_inst_from_label(:label)"), params)] + res = [i for i, in self.session.execute(sql_text('select * from desc_inst_from_label(:label)'), params)] if res: out = res[0] if out is None: @@ -264,7 +272,7 @@ def desc_inst_from_label(self, label): def desc_quant_from_label(self, label): # FIXME multi etc. params = dict(label=label) - res = [i for i, in self.session.execute(sql_text("select * from desc_quant_from_label(:label)"), params)] + res = [i for i, in self.session.execute(sql_text('select * from desc_quant_from_label(:label)'), params)] if res: out = res[0] if out is None: @@ -275,8 +283,12 @@ def desc_quant_from_label(self, label): def desc_cat_from_label_domain_label(self, label, domain_label): # FIXME multi etc. params = dict(label=label, domain_label=domain_label) - res = [i for i, in self.session.execute(sql_text("select * from desc_cat_from_label_domain_label(:label, :domain_label)"), - params)] + res = [ + i + for i, in self.session.execute( + sql_text('select * from desc_cat_from_label_domain_label(:label, :domain_label)'), params + ) + ] if res: out = res[0] if out is None: @@ -287,7 +299,7 @@ def desc_cat_from_label_domain_label(self, label, domain_label): def cterm_from_label(self, label): # FIXME multi etc. params = dict(label=label) - res = [i for i, in self.session.execute(sql_text("select * from cterm_from_label(:label)"), params)] + res = [i for i, in self.session.execute(sql_text('select * from cterm_from_label(:label)'), params)] if res: out = res[0] if out is None: @@ -295,14 +307,13 @@ def cterm_from_label(self, label): else: return out - def insts_from_dataset(self, dataset): - return list(self.session.execute(sql_text("select * from insts_from_dataset(:dataset)"), dict(dataset=dataset))) + return list(self.session.execute(sql_text('select * from insts_from_dataset(:dataset)'), dict(dataset=dataset))) def insts_from_dataset_ids(self, dataset, ids): return list( self.session.execute( - sql_text("select * from insts_from_dataset_ids(:dataset, :ids)"), dict(dataset=dataset, ids=ids) + sql_text('select * from insts_from_dataset_ids(:dataset, :ids)'), dict(dataset=dataset, ids=ids) ) ) @@ -315,18 +326,24 @@ def __init__(self, queries): self.addr_tmod = q.address_from_fadd_type_fadd('tabular-header', 'modality') self.addr_NFasc = q.address_from_fadd_type_fadd('tabular-header', 'NFasc') # FIXME not really a tabular source - self.addr_dNerve_um = q.address_from_fadd_type_fadd('tabular-header', 'dNerve_um') # FIXME not really a tabular source + self.addr_dNerve_um = q.address_from_fadd_type_fadd( + 'tabular-header', 'dNerve_um' + ) # FIXME not really a tabular source self.addr_laterality = q.address_from_fadd_type_fadd('tabular-header', 'laterality') self.addr_level = q.address_from_fadd_type_fadd('tabular-header', 'level') - self.addr_dFasc_um_idx = q.address_from_fadd_type_fadd('json-path-with-types', '#/#int/dFasc_um') # FIXME not really a json source, FIXME how to distinguish the index from the value - self.addr_dFasc_um_value = q.address_from_fadd_type_fadd('json-path-with-types', '#/#int/dFasc_um/#int') # FIXME not really a json source + self.addr_dFasc_um_idx = q.address_from_fadd_type_fadd( + 'json-path-with-types', '#/#int/dFasc_um' + ) # FIXME not really a json source, FIXME how to distinguish the index from the value + self.addr_dFasc_um_value = q.address_from_fadd_type_fadd( + 'json-path-with-types', '#/#int/dFasc_um/#int' + ) # FIXME not really a json source - #addr_trai 
= address_from_fadd_type_fadd('tabular-header', 'raw_anat_index') - #addr_tnai = address_from_fadd_type_fadd('tabular-header', 'norm_anat_index') - #addr_context = address_from_fadd_type_fadd('context', '#/path-metadata/{index of match remote_id}/dataset_relative_path') # XXX this doesn't do what we want, I think what we really would want in these contexts are objects_internal that reference the file system state for a given updated snapshot, that is the real "object" that corresponds to the path-metadata.json that we are working from + # addr_trai = address_from_fadd_type_fadd('tabular-header', 'raw_anat_index') + # addr_tnai = address_from_fadd_type_fadd('tabular-header', 'norm_anat_index') + # addr_context = address_from_fadd_type_fadd('context', '#/path-metadata/{index of match remote_id}/dataset_relative_path') # XXX this doesn't do what we want, I think what we really would want in these contexts are objects_internal that reference the file system state for a given updated snapshot, that is the real "object" that corresponds to the path-metadata.json that we are working from - self.addr_tmod = q.address_from_fadd_type_fadd("tabular-header", "modality") + self.addr_tmod = q.address_from_fadd_type_fadd('tabular-header', 'modality') # addr_trai = address_from_fadd_type_fadd('tabular-header', 'raw_anat_index') # addr_tnai = address_from_fadd_type_fadd('tabular-header', 'norm_anat_index') # addr_context = address_from_fadd_type_fadd('context', '#/path-metadata/{index of match remote_id}/dataset_relative_path') # XXX this doesn't do what we want, I think what we really would want in these contexts are objects_internal that reference the file system state for a given updated snapshot, that is the real "object" that corresponds to the path-metadata.json that we are working from @@ -336,23 +353,41 @@ def __init__(self, queries): # addr_jpnai = address_from_fadd_type_fadd('json-path-with-types', '#/#int/norm_anat_index') self.addr_jpdrp = q.address_from_fadd_type_fadd( - "json-path-with-types", "#/path-metadata/data/#int/dataset_relative_path" + 'json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path' ) # XXX these are more accurate if opaque - self.addr_jpmod = q.address_from_fadd_type_fadd('json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-modality') - #addr_jprai = address_from_fadd_type_fadd('json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-raw-anat-index') - self.addr_jpnai1 = q.address_from_fadd_type_fadd('json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-norm-anat-index-v1') - self.addr_jpnain1 = q.address_from_fadd_type_fadd('json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-norm-anat-index-v1-min') - self.addr_jpnaix1 = q.address_from_fadd_type_fadd('json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-norm-anat-index-v1-max') - self.addr_jpnai = q.address_from_fadd_type_fadd('json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-norm-anat-index-v2') - self.addr_jpnain = q.address_from_fadd_type_fadd('json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-norm-anat-index-v2-min') - self.addr_jpnaix = q.address_from_fadd_type_fadd('json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-norm-anat-index-v2-max') - self.addr_jpsuid = q.address_from_fadd_type_fadd('json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-subject-id') - 
self.addr_jpsaid = q.address_from_fadd_type_fadd('json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-sample-id') - - self.addr_jpspec = q.address_from_fadd_type_fadd("json-path-with-types", "#/local/tom-made-it-up/species") - self.addr_jpsaty = q.address_from_fadd_type_fadd("json-path-with-types", "#/local/tom-made-it-up/sample_type") + self.addr_jpmod = q.address_from_fadd_type_fadd( + 'json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-modality' + ) + # addr_jprai = address_from_fadd_type_fadd('json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-raw-anat-index') + self.addr_jpnai1 = q.address_from_fadd_type_fadd( + 'json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-norm-anat-index-v1' + ) + self.addr_jpnain1 = q.address_from_fadd_type_fadd( + 'json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-norm-anat-index-v1-min' + ) + self.addr_jpnaix1 = q.address_from_fadd_type_fadd( + 'json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-norm-anat-index-v1-max' + ) + self.addr_jpnai = q.address_from_fadd_type_fadd( + 'json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-norm-anat-index-v2' + ) + self.addr_jpnain = q.address_from_fadd_type_fadd( + 'json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-norm-anat-index-v2-min' + ) + self.addr_jpnaix = q.address_from_fadd_type_fadd( + 'json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-norm-anat-index-v2-max' + ) + self.addr_jpsuid = q.address_from_fadd_type_fadd( + 'json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-subject-id' + ) + self.addr_jpsaid = q.address_from_fadd_type_fadd( + 'json-path-with-types', '#/path-metadata/data/#int/dataset_relative_path#derive-sample-id' + ) + + self.addr_jpspec = q.address_from_fadd_type_fadd('json-path-with-types', '#/local/tom-made-it-up/species') + self.addr_jpsaty = q.address_from_fadd_type_fadd('json-path-with-types', '#/local/tom-made-it-up/sample_type') # future version when we actually have the metadata files # addr_jpmod = address_from_fadd_type_fadd('json-path-with-types', '#/curation-export/manifest/#int/modality') @@ -361,13 +396,13 @@ def __init__(self, queries): # addr_jpsuid = address_from_fadd_type_fadd('json-path-with-types', '#/curation-export/subjects/#int/id_sub') # addr_jpsaid = address_from_fadd_type_fadd('json-path-with-types', '#/curation-export/samples/#int/id_sam') - self.addr_const_null = q.address_from_fadd_type_fadd("constant", None) + self.addr_const_null = q.address_from_fadd_type_fadd('constant', None) self.qd_nvlai1 = q.desc_quant_from_label('vagus level anatomical location distance index normalized v1') self.qd_nvlain1 = q.desc_quant_from_label('vagus level anatomical location distance index normalized v1 min') self.qd_nvlaix1 = q.desc_quant_from_label('vagus level anatomical location distance index normalized v1 max') - #qd_rai = desc_quant_from_label('reva ft sample anatomical location distance index raw') + # qd_rai = desc_quant_from_label('reva ft sample anatomical location distance index raw') self.qd_nai1 = q.desc_quant_from_label('reva ft sample anatomical location distance index normalized v1') self.qd_nain1 = q.desc_quant_from_label('reva ft sample anatomical location distance index normalized v1 min') self.qd_naix1 = q.desc_quant_from_label('reva ft sample anatomical location distance index normalized 
v1 max')
@@ -382,7 +417,9 @@ def __init__(self, queries):
 
         self.cd_mod = q.desc_cat_from_label_domain_label('hasDataAboutItModality', None)
         self.cd_obj = q.desc_cat_from_label_domain_label('hasAssociatedObject', None)
-        self.cd_bot = q.desc_cat_from_label_domain_label('bottom', None) # we just need something we can reference that points to null so we can have a refernce to all the objects, XXX but it can't actually be bottom because bottom by definition relates no entities
+        self.cd_bot = q.desc_cat_from_label_domain_label(
+            'bottom', None
+        )  # we just need something we can reference that points to null so we can have a reference to all the objects, XXX but it can't actually be bottom because bottom by definition relates no entities
 
         self.id_human = q.desc_inst_from_label('human')
         self.id_nerve = q.desc_inst_from_label('nerve')
@@ -394,14 +431,14 @@ def __init__(self, queries):
             'nerve': self.id_nerve,
             'nerve-volume': self.id_nerve_volume,
             'nerve-cross-section': self.id_nerve_cross_section,
-            'fascicle-cross-section': self.id_fascicle_cross_section
+            'fascicle-cross-section': self.id_fascicle_cross_section,
         }
 
-        self.ct_mod = q.cterm_from_label("microct")  # lol ct ct
-        self.ct_hack = q.cterm_from_label("hack-associate-some-value")
+        self.ct_mod = q.cterm_from_label('microct')  # lol ct ct
+        self.ct_hack = q.cterm_from_label('hack-associate-some-value')
         self.luct = {
-            "ct-hack": self.ct_hack,
-            "microct": self.ct_mod,
+            'ct-hack': self.ct_hack,
+            'microct': self.ct_mod,
         }
 
 
@@ -411,20 +448,30 @@ class Inserts:
 
 
 def ingest(dataset_uuid, extract_fun, session, commit=False, dev=False, values_args=None, **kwargs):
-    """ generic ingest workflow
-    this_dataset_updated_uuid might not be needed in future,
-    add a kwarg to control it maybe?
+    """generic ingest workflow
+    this_dataset_updated_uuid might not be needed in future,
+    add a kwarg to control it maybe? 
""" - ocdn = " ON CONFLICT DO NOTHING" if dev else "" + ocdn = ' ON CONFLICT DO NOTHING' if dev else '' if extract_fun is None and values_args is None: - raise TypeError("need one of extract_fun or values_args") - - (updated_transitive, values_objects, values_dataset_object, - make_values_instances, make_values_parents, - make_void, make_vocd, make_voqd, make_values_cat, make_values_quant - ) = extract_fun(dataset_uuid, **kwargs) if values_args is None else values_args + raise TypeError('need one of extract_fun or values_args') + + ( + updated_transitive, + values_objects, + values_dataset_object, + make_values_instances, + make_values_parents, + make_void, + make_vocd, + make_voqd, + make_values_cat, + make_values_quant, + ) = ( + extract_fun(dataset_uuid, **kwargs) if values_args is None else values_args + ) q = Queries(session) i = InternalIds(q) @@ -433,15 +480,22 @@ def ingest(dataset_uuid, extract_fun, session, commit=False, dev=False, values_a values_instances = make_values_instances(i) res0 = session.execute( - sql_text("INSERT INTO objects (id, id_type) VALUES (:id, :id_type) ON CONFLICT DO NOTHING"), - dict(id=dataset_uuid, id_type="dataset"), + sql_text('INSERT INTO objects (id, id_type) VALUES (:id, :id_type) ON CONFLICT DO NOTHING'), + dict(id=dataset_uuid, id_type='dataset'), ) # oh dear https://stackoverflow.com/questions/34708509/how-to-use-returning-with-on-conflict-in-postgresql if updated_transitive: res1 = session.execute( - sql_text("WITH ins AS (INSERT INTO objects_internal (type, dataset, updated_transitive, label) VALUES ('path-metadata', :dataset, :updated_transitive, :label) ON CONFLICT DO NOTHING RETURNING id) SELECT id FROM ins UNION ALL SELECT id FROM objects_internal WHERE type = 'path-metadata' AND dataset = :dataset AND updated_transitive = :updated_transitive"), # TODO see whether we actually need union all here or whether union by itself is sufficient - dict(dataset=dataset_uuid, updated_transitive=updated_transitive, label=f'test-load-for-f001 {isoformat(updated_transitive)}')) + sql_text( + "WITH ins AS (INSERT INTO objects_internal (type, dataset, updated_transitive, label) VALUES ('path-metadata', :dataset, :updated_transitive, :label) ON CONFLICT DO NOTHING RETURNING id) SELECT id FROM ins UNION ALL SELECT id FROM objects_internal WHERE type = 'path-metadata' AND dataset = :dataset AND updated_transitive = :updated_transitive" + ), # TODO see whether we actually need union all here or whether union by itself is sufficient + dict( + dataset=dataset_uuid, + updated_transitive=updated_transitive, + label=f'test-load-for-f001 {isoformat(updated_transitive)}', + ), + ) # it is better to use this approach for all top down information # just assume that it is coming from some combination of the metadata files and the file system @@ -456,23 +510,26 @@ def ingest(dataset_uuid, extract_fun, session, commit=False, dev=False, values_a if updated_transitive: res1_1 = session.execute( - sql_text('INSERT INTO objects (id, id_type, id_internal) VALUES (:id, :id_type, :id) ON CONFLICT DO NOTHING'), # FIXME bad ocdn here - dict(id=this_dataset_updated_uuid, id_type='quantdb')) + sql_text( + 'INSERT INTO objects (id, id_type, id_internal) VALUES (:id, :id_type, :id) ON CONFLICT DO NOTHING' + ), # FIXME bad ocdn here + dict(id=this_dataset_updated_uuid, id_type='quantdb'), + ) vt, params = makeParamsValues(values_objects) - session.execute(sql_text(f"INSERT INTO objects (id, id_type, id_file) VALUES {vt}{ocdn}"), params) + session.execute(sql_text(f'INSERT INTO objects 
(id, id_type, id_file) VALUES {vt}{ocdn}'), params) vt, params = makeParamsValues(values_dataset_object) - session.execute(sql_text(f"INSERT INTO dataset_object (dataset, object) VALUES {vt}{ocdn}"), params) + session.execute(sql_text(f'INSERT INTO dataset_object (dataset, object) VALUES {vt}{ocdn}'), params) vt, params = makeParamsValues(values_instances) session.execute( - sql_text(f"INSERT INTO values_inst (dataset, id_formal, type, desc_inst, id_sub, id_sam) VALUES {vt}{ocdn}"), + sql_text(f'INSERT INTO values_inst (dataset, id_formal, type, desc_inst, id_sub, id_sam) VALUES {vt}{ocdn}'), params, ) # inserts that depend on instances having already been inserted - #ilt = q.insts_from_dataset_ids(dataset_uuid, [f for d, f, *rest in values_instances]) + # ilt = q.insts_from_dataset_ids(dataset_uuid, [f for d, f, *rest in values_instances]) # get all instances in a dataset since values_inst only includes instances we plan to insert # not those that were already inserted that we want to add values for ilt = q.insts_from_dataset(dataset_uuid) @@ -482,11 +539,11 @@ def ingest(dataset_uuid, extract_fun, session, commit=False, dev=False, values_a values_qv = make_values_quant(this_dataset_updated_uuid, i, luinst) vt, params = makeParamsValues(values_parents) - session.execute(sql_text(f"INSERT INTO instance_parent VALUES {vt}{ocdn}"), params) + session.execute(sql_text(f'INSERT INTO instance_parent VALUES {vt}{ocdn}'), params) vt, params = makeParamsValues(void) session.execute( - sql_text(f"INSERT INTO obj_desc_inst (object, desc_inst, addr_field, addr_desc_inst) VALUES {vt}{ocdn}"), params + sql_text(f'INSERT INTO obj_desc_inst (object, desc_inst, addr_field, addr_desc_inst) VALUES {vt}{ocdn}'), params ) if vocd: @@ -495,20 +552,31 @@ def ingest(dataset_uuid, extract_fun, session, commit=False, dev=False, values_a if voqd: vt, params = makeParamsValues(voqd) - session.execute(sql_text(f'INSERT INTO obj_desc_quant (object, desc_quant, addr_field) VALUES {vt}{ocdn}'), params) + session.execute( + sql_text(f'INSERT INTO obj_desc_quant (object, desc_quant, addr_field) VALUES {vt}{ocdn}'), params + ) if values_cv: vt, params = makeParamsValues(values_cv) - session.execute(sql_text(f'INSERT INTO values_cat (value_open, value_controlled, object, desc_inst, desc_cat, instance) VALUES {vt}{ocdn}'), params) + session.execute( + sql_text( + f'INSERT INTO values_cat (value_open, value_controlled, object, desc_inst, desc_cat, instance) VALUES {vt}{ocdn}' + ), + params, + ) if values_qv: vt, params, bindparams = makeParamsValues( # FIXME LOL the types spec here is atrocious ... but it does work ... 
# XXX and barring the unfortunate case, which we have now encountered where # now fixed in the local impl - values_qv, row_types=(None, None, None, None, None, JSONB)) + values_qv, + row_types=(None, None, None, None, None, JSONB), + ) - t = sql_text(f'INSERT INTO values_quant (value, object, desc_inst, desc_quant, instance, value_blob) VALUES {vt}{ocdn}') + t = sql_text( + f'INSERT INTO values_quant (value, object, desc_inst, desc_quant, instance, value_blob) VALUES {vt}{ocdn}' + ) tin = t.bindparams(*bindparams) session.execute(tin, params) @@ -520,14 +588,14 @@ def extract_reva_ft(dataset_uuid, source_local=False, visualize=True): if source_local: with open( pathlib.Path( - f"~/.local/share/sparcur/export/datasets/{dataset_uuid}/LATEST/path-metadata.json" + f'~/.local/share/sparcur/export/datasets/{dataset_uuid}/LATEST/path-metadata.json' ).expanduser(), - "rt", + 'rt', ) as f: blob = json.load(f) else: - resp = requests.get(f"https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/path-metadata.json") + resp = requests.get(f'https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/path-metadata.json') try: blob = resp.json() @@ -535,8 +603,8 @@ def extract_reva_ft(dataset_uuid, source_local=False, visualize=True): breakpoint() raise e - for j in blob["data"]: - j["type"] = "pathmeta" + for j in blob['data']: + j['type'] = 'pathmeta' ir = fromJson(blob) @@ -545,27 +613,29 @@ def extract_reva_ft(dataset_uuid, source_local=False, visualize=True): jpx = [r for r in ir['data'] if 'mimetype' in r and r['mimetype'] == 'image/jpx'] exts = [ext_pmeta(j) for j in jpx] - #hrm = sorted(exts, key=lambda j: j['raw_anat_index']) - #max_rai = max([e['raw_anat_index'] for e in exts]) - #import math - #log_max_rai = math.log10(max_rai) + # hrm = sorted(exts, key=lambda j: j['raw_anat_index']) + # max_rai = max([e['raw_anat_index'] for e in exts]) + # import math + # log_max_rai = math.log10(max_rai) # normalize the index by mapping distinct values to the integers nondist = sorted([e['raw_anat_index_v2'] for e in exts]) - lin_distinct = {v:i for i, v in enumerate(sorted(set([e['raw_anat_index_v2'] for e in exts])))} + lin_distinct = {v: i for i, v in enumerate(sorted(set([e['raw_anat_index_v2'] for e in exts])))} max_distinct = len(lin_distinct) mdp1 = max_distinct + 0.1 # to simplify adding overlap dd = defaultdict(list) for e in exts: - #e['norm_anat_index'] = math.log10(e['raw_anat_index']) / log_max_rai + # e['norm_anat_index'] = math.log10(e['raw_anat_index']) / log_max_rai pos = lin_distinct[e['raw_anat_index_v2']] - e['norm_anat_index_v2'] = (pos + 0.55) / mdp1 - e['norm_anat_index_v2_min'] = pos / mdp1 - e['norm_anat_index_v2_max'] = (pos + 1.1) / mdp1 # ensure there is overlap between section for purposes of testing + e['norm_anat_index_v2'] = (pos + 0.55) / mdp1 + e['norm_anat_index_v2_min'] = pos / mdp1 + e['norm_anat_index_v2_max'] = ( + pos + 1.1 + ) / mdp1 # ensure there is overlap between section for purposes of testing # TODO norm_anat_index_min # TODO norm_anat_index_max - dd[e["dataset"], e["sample"]].append(e) + dd[e['dataset'], e['sample']].append(e) inst_obj_index = dict(dd) max_nai = max([e['norm_anat_index_v2'] for e in exts]) @@ -583,7 +653,7 @@ def extract_reva_ft(dataset_uuid, source_local=False, visualize=True): _exts = exts exts = mexts x = list(range(len(exts))) - #ry = sorted([e['raw_anat_index'] for e in exts]) + # ry = sorted([e['raw_anat_index'] for e in exts]) idy = [b for a, b in sorted([(e['norm_anat_index_v2'], e['sample']) for e in exts])] ny = 
sorted([e['norm_anat_index_v2'] for e in exts]) nyn = sorted([e['norm_anat_index_v2_min'] for e in exts]) @@ -601,50 +671,50 @@ def extract_reva_ft(dataset_uuid, source_local=False, visualize=True): seaborn.scatterplot(x=x[:end], y=nyn[:end], label='min') seaborn.scatterplot(x=x[:end], y=nyx[:end], label='max') _sid = blob['data'][0]['basename'].split('-')[-1].strip() - #if _sid == 'f003': - #breakpoint() + # if _sid == 'f003': + # breakpoint() plt.title(f'norm-anat-index-v2 for {_sid}') plt.xlabel('nth sample') plt.ylabel('normalized anatomical index v2') plt.legend(loc='upper left') - #plt.savefig(f'ft-norm-anat-index-v2-{dataset_uuid[:4]}.png') + # plt.savefig(f'ft-norm-anat-index-v2-{dataset_uuid[:4]}.png') plt.savefig(f'ft-norm-anat-index-v2-{_sid}.png') exts = _exts - datasets = {i.uuid: {"id_type": i.type} for e in exts if (i := e["dataset"])} + datasets = {i.uuid: {'id_type': i.type} for e in exts if (i := e['dataset'])} packages = { i.uuid: { - "id_type": i.type, - "id_file": e["file_id"], + 'id_type': i.type, + 'id_file': e['file_id'], } for e in exts - if (i := e["object"]) + if (i := e['object']) } objects = {**datasets, **packages} - dataset_object = list(set((d.uuid, o.uuid) for e in exts if (d := e["dataset"]) and (o := e["object"]))) + dataset_object = list(set((d.uuid, o.uuid) for e in exts if (d := e['dataset']) and (o := e['object']))) subjects = { k: { - "type": "subject", - "desc_inst": "human", - "id_sub": k[1], + 'type': 'subject', + 'desc_inst': 'human', + 'id_sub': k[1], } - for k in sorted(set((e["dataset"], e["subject"]) for e in exts)) + for k in sorted(set((e['dataset'], e['subject']) for e in exts)) } segments = { k[:2]: { - "type": "sample", # FIXME vs below ??? - "desc_inst": "nerve-volume", # FIXME should this be nerve-segment and then we use nerve-volume for the 1:1 with files? - "id_sub": k[-1], - "id_sam": k[1], + 'type': 'sample', # FIXME vs below ??? + 'desc_inst': 'nerve-volume', # FIXME should this be nerve-segment and then we use nerve-volume for the 1:1 with files? 
+ 'id_sub': k[-1], + 'id_sam': k[1], } - for k in sorted(set((e["dataset"], e["sample"], e["subject"]) for e in exts)) + for k in sorted(set((e['dataset'], e['sample'], e['subject']) for e in exts)) } - parents = sorted(set((e["dataset"],) + p for e in exts for p in e["parents"])) + parents = sorted(set((e['dataset'],) + p for e in exts for p in e['parents'])) sam_other = { - p[:2]: {"type": "sample", "desc_inst": "nerve", "id_sub": p[-1], "id_sam": p[1]} + p[:2]: {'type': 'sample', 'desc_inst': 'nerve', 'id_sub': p[-1], 'id_sam': p[1]} for p in parents if p[:2] not in segments } @@ -652,17 +722,21 @@ def extract_reva_ft(dataset_uuid, source_local=False, visualize=True): instances = {**subjects, **samples} values_objects = [ - (i, o["id_type"], o["id_file"] if "id_file" in o else None) + (i, o['id_type'], o['id_file'] if 'id_file' in o else None) for i, o in objects.items() - if o["id_type"] != "dataset" # already did it above + if o['id_type'] != 'dataset' # already did it above ] values_dataset_object = dataset_object def make_values_instances(i): values_instances = [ - (d.uuid, f, inst['type'], i.luid[inst['desc_inst']], - inst['id_sub'] if 'id_sub' in inst else None, - inst['id_sam'] if 'id_sam' in inst else None, + ( + d.uuid, + f, + inst['type'], + i.luid[inst['desc_inst']], + inst['id_sub'] if 'id_sub' in inst else None, + inst['id_sam'] if 'id_sam' in inst else None, ) for (d, f), inst in instances.items() ] @@ -699,7 +773,7 @@ def make_void(this_dataset_updated_uuid, i): None, ) # XXX FIXME this is the only way I can think to do this right now ? for o, b in objects.items() - if b["id_type"] == "package" + if b['id_type'] == 'package' ] return void @@ -709,8 +783,11 @@ def make_vocd(this_dataset_updated_uuid, i): # FIXME this reveals that there are cases where we may not have void for a single file or that the id comes from context and is not embedded # figuring out how to turn that around is going to take a bit of thinking (this_dataset_updated_uuid, i.cd_mod, i.addr_jpmod), - ] + [(o, i.cd_obj, i.addr_const_null) # XXX FIXME this is the only way I can think to do this right now ? - for o, b in objects.items() if b['id_type'] == 'package'] + ] + [ + (o, i.cd_obj, i.addr_const_null) # XXX FIXME this is the only way I can think to do this right now ? 
+ for o, b in objects.items() + if b['id_type'] == 'package' + ] return vocd @@ -738,17 +815,18 @@ def make_values_cat(this_dataset_updated_uuid, i, luinst): # e['object'].uuid, # FIXME still not right this comes from the updated latest i.id_nerve_volume, cd, # if we mess this up the fk ok obj_desc_cat will catch it :) - luinst[e["dataset"].uuid, e["sample"]], # get us the instance + luinst[e['dataset'].uuid, e['sample']], # get us the instance ) for e in exts - for k, cd in (("modality", i.cd_mod),) + for k, cd in (('modality', i.cd_mod),) ] + [ - (None, - i.ct_hack, - e['object'].uuid, - i.id_nerve_volume, - i.cd_obj, # if we mess this up the fk ok obj_desc_cat will catch it :) - luinst[e['dataset'].uuid, e['sample']], # get us the instance + ( + None, + i.ct_hack, + e['object'].uuid, + i.id_nerve_volume, + i.cd_obj, # if we mess this up the fk ok obj_desc_cat will catch it :) + luinst[e['dataset'].uuid, e['sample']], # get us the instance ) for e in exts ] @@ -763,24 +841,31 @@ def make_values_quant(this_dataset_updated_uuid, i, luinst): this_dataset_updated_uuid, i.id_nerve_volume, qd, # if we mess this up the fk ok obj_desc_cat will catch it :) - luinst[e["dataset"].uuid, e["sample"]], # get us the instance + luinst[e['dataset'].uuid, e['sample']], # get us the instance e[k], ) for e in exts for k, qd in ( - #('raw_anat_index', qd_rai), # XXX this is a bad place to store object -> field -> qd mappings also risks mismatch on address - ('norm_anat_index_v2', i.qd_nai), - ('norm_anat_index_v2_min', i.qd_nain), - ('norm_anat_index_v2_max', i.qd_naix), + # ('raw_anat_index', qd_rai), # XXX this is a bad place to store object -> field -> qd mappings also risks mismatch on address + ('norm_anat_index_v2', i.qd_nai), + ('norm_anat_index_v2_min', i.qd_nain), + ('norm_anat_index_v2_max', i.qd_naix), ) ] return values_qv - return (updated_transitive, values_objects, values_dataset_object, - make_values_instances, make_values_parents, - make_void, make_vocd, make_voqd, - make_values_cat, make_values_quant, - ) + return ( + updated_transitive, + values_objects, + values_dataset_object, + make_values_instances, + make_values_parents, + make_void, + make_vocd, + make_voqd, + make_values_cat, + make_values_quant, + ) # this is where things get annoying with needing selects on instance measured @@ -792,36 +877,41 @@ def values_objects_from_objects(objects): if o['id_type'] != 'dataset' # already did it above ] + def ext_values(exts): - datasets = {i.uuid: {'id_type': i.type} - for e in exts - if (i := e['dataset']) - } + datasets = {i.uuid: {'id_type': i.type} for e in exts if (i := e['dataset'])} - packages = {i.uuid: { - 'id_type': i.type, - 'id_file': e['file_id'], + packages = { + i.uuid: { + 'id_type': i.type, + 'id_file': e['file_id'], + } + for e in exts + if (i := e['object']) } - for e in exts - if (i := e['object']) - } objects = {**datasets, **packages} - dataset_object = list(set((d.uuid, o.uuid) for e in exts - if (d := e['dataset']) and (o := e['object']) - )) - - subjects = {k: {'type': 'subject', - 'desc_inst': 'human', # FIXME hardcoded - 'id_sub': k[1], - } for k in sorted(set((e['dataset'], e['subject']) for e in exts))} + dataset_object = list(set((d.uuid, o.uuid) for e in exts if (d := e['dataset']) and (o := e['object']))) + + subjects = { + k: { + 'type': 'subject', + 'desc_inst': 'human', # FIXME hardcoded + 'id_sub': k[1], + } + for k in sorted(set((e['dataset'], e['subject']) for e in exts)) + } parents = sorted(set((e['dataset'],) + p for e in exts for p in e['parents'])) - 
samples = {k[:2]: {'type': 'sample', - 'desc_inst': 'nerve-cross-section', # FIXME hardcoded - 'id_sub': k[-1], - 'id_sam': k[1], - } for k in sorted(set((e['dataset'], e['sample'], e['subject']) for e in exts))} + samples = { + k[:2]: { + 'type': 'sample', + 'desc_inst': 'nerve-cross-section', # FIXME hardcoded + 'id_sub': k[-1], + 'id_sam': k[1], + } + for k in sorted(set((e['dataset'], e['sample'], e['subject']) for e in exts)) + } instances = {**subjects, **samples} @@ -857,31 +947,36 @@ def extract_demo_jp2(dataset_uuid, source_local=False): def make_values_instances(i): values_instances = [ - (d.uuid, f, inst['type'], i.luid[inst['desc_inst']], - inst['id_sub'] if 'id_sub' in inst else None, - inst['id_sam'] if 'id_sam' in inst else None, + ( + d.uuid, + f, + inst['type'], + i.luid[inst['desc_inst']], + inst['id_sub'] if 'id_sub' in inst else None, + inst['id_sam'] if 'id_sam' in inst else None, ) - for (d, f), inst in instances.items()] + for (d, f), inst in instances.items() + ] return values_instances def make_values_parents(luinst): - """ need the lookup for instances """ - values_parents = [ - (luinst[d.uuid, child], luinst[d.uuid, parent]) - for d, child, parent in parents] + """need the lookup for instances""" + values_parents = [(luinst[d.uuid, child], luinst[d.uuid, parent]) for d, child, parent in parents] return values_parents def make_void(this_dataset_updated_uuid, i): # we don't derive anything from the dataset updated uuid so nothing goes here - void = [(o, i.id_nerve_cross_section, i.addr_const_null, None) - for o, b in objects.items() if b['id_type'] == 'package'] + void = [ + (o, i.id_nerve_cross_section, i.addr_const_null, None) + for o, b in objects.items() + if b['id_type'] == 'package' + ] return void def make_vocd(this_dataset_updated_uuid, i): # we don't derive anything from the dataset updated uuid so nothing goes here - vocd = [(o, i.cd_obj, i.addr_const_null) - for o, b in objects.items() if b['id_type'] == 'package'] + vocd = [(o, i.cd_obj, i.addr_const_null) for o, b in objects.items() if b['id_type'] == 'package'] return vocd def make_voqd(this_dataset_updated_uuid, i): @@ -891,13 +986,14 @@ def make_voqd(this_dataset_updated_uuid, i): def make_values_cat(this_dataset_updated_uuid, i, luinst): # we don't derive anything from the dataset updated uuid so nothing goes here values_cv = [ - (None, - i.ct_hack, - e['object'].uuid, - i.id_nerve_cross_section, - i.cd_obj, # if we mess this up the fk ok obj_desc_cat will catch it :) - luinst[e['dataset'].uuid, e['sample']], # get us the instance - ) + ( + None, + i.ct_hack, + e['object'].uuid, + i.id_nerve_cross_section, + i.cd_obj, # if we mess this up the fk ok obj_desc_cat will catch it :) + luinst[e['dataset'].uuid, e['sample']], # get us the instance + ) for e in exts ] return values_cv @@ -906,20 +1002,31 @@ def make_values_quant(this_dataset_updated_uuid, i, luinst): values_qv = [] return values_qv - return (updated_transitive, values_objects, values_dataset_object, - make_values_instances, make_values_parents, - make_void, make_vocd, make_voqd, - make_values_cat, make_values_quant, - ) + return ( + updated_transitive, + values_objects, + values_dataset_object, + make_values_instances, + make_values_parents, + make_void, + make_vocd, + make_voqd, + make_values_cat, + make_values_quant, + ) -import scipy import augpathlib as aug +import scipy from sparcur.datasets import SamplesFilePath + + def extract_demo(dataset_uuid, source_local=True): dataset_id = RemoteId('dataset:' + dataset_uuid) - _dsp = 
('/mnt/str/tom/sparc-datasets/55c5b69c-a5b8-4881-a105-e4048af26fa5/SPARC/' - 'Quantified morphology of the human vagus nerve with anti-claudin-1/') + _dsp = ( + '/mnt/str/tom/sparc-datasets/55c5b69c-a5b8-4881-a105-e4048af26fa5/SPARC/' + 'Quantified morphology of the human vagus nerve with anti-claudin-1/' + ) p = _dsp + 'derivative/CadaverVNMorphology_OutputMetrics.mat' _p = aug.AugmentedPath(_dsp + 'samples.xlsx') sp = SamplesFilePath(_p) @@ -960,7 +1067,7 @@ def level_to_vdd(level): msg = f'unknown vagus level {level}' raise NotImplementedError(msg) - sane_data = [{k:fk(v[0]) for k, fk, v in zip(ks, fks, _)} for _ in zip(*[m[k][0] for k in ks])] + sane_data = [{k: fk(v[0]) for k, fk, v in zip(ks, fks, _)} for _ in zip(*[m[k][0] for k in ks])] instances = {} parents = [] nerve_qvs = [] @@ -981,13 +1088,15 @@ def level_to_vdd(level): 'id_sam': id_sam, } vdd = level_to_vdd(sd['level']) - nerve_qvs.append({ - **vdd, - 'id_formal': id_sam, - 'desc_inst': 'nerve-cross-section', - 'diameter-um': sd['dNerve_um'], - 'number-of-fascicles': sd['NFasc'], - }) + nerve_qvs.append( + { + **vdd, + 'id_formal': id_sam, + 'desc_inst': 'nerve-cross-section', + 'diameter-um': sd['dNerve_um'], + 'number-of-fascicles': sd['NFasc'], + } + ) for i, fdum in enumerate(sd['dFasc_um']): id_formal = f'fasc-{id_sam}-{i}' @@ -998,12 +1107,14 @@ def level_to_vdd(level): 'id_sub': id_sub, 'id_sam': id_sam, } - fasc_qvs.append({ - **vdd, - 'id_formal': id_formal, - 'desc_inst': 'fascicle-cross-section', - 'diameter-um': fdum, - }) + fasc_qvs.append( + { + **vdd, + 'id_formal': id_formal, + 'desc_inst': 'fascicle-cross-section', + 'diameter-um': fdum, + } + ) updated_transitive = None @@ -1013,9 +1124,13 @@ def level_to_vdd(level): def make_values_instances(i): values_instances = [ - (d.uuid, f, inst['type'], i.luid[inst['desc_inst']], - inst['id_sub'] if 'id_sub' in inst else None, - inst['id_sam'] if 'id_sam' in inst else None, + ( + d.uuid, + f, + inst['type'], + i.luid[inst['desc_inst']], + inst['id_sub'] if 'id_sub' in inst else None, + inst['id_sam'] if 'id_sam' in inst else None, ) for (d, f), inst in instances.items() if inst['desc_inst'] != 'nerve-cross-section' # XXX already handled from the jp2 side @@ -1023,22 +1138,34 @@ def make_values_instances(i): return values_instances def make_values_parents(luinst): - """ need the lookup for instances """ - values_parents = [ - (luinst[d.uuid, child], luinst[d.uuid, parent]) - for d, child, parent in parents] + """need the lookup for instances""" + values_parents = [(luinst[d.uuid, child], luinst[d.uuid, parent]) for d, child, parent in parents] return values_parents def make_void(this_dataset_updated_uuid, i): - void = [(o, i.id_nerve_cross_section, i.addr_dFasc_um_idx, None) # FIXME add_const_null is wrong, should be "from curator" - for o, b in objects.items() if b['id_type'] == 'package' - ] + [(o, i.id_fascicle_cross_section, i.addr_dFasc_um_idx, None) # FIXME add_const_null is wrong, should be "from curator" - for o, b in objects.items() if b['id_type'] == 'package'] + void = [ + ( + o, + i.id_nerve_cross_section, + i.addr_dFasc_um_idx, + None, + ) # FIXME add_const_null is wrong, should be "from curator" + for o, b in objects.items() + if b['id_type'] == 'package' + ] + [ + ( + o, + i.id_fascicle_cross_section, + i.addr_dFasc_um_idx, + None, + ) # FIXME add_const_null is wrong, should be "from curator" + for o, b in objects.items() + if b['id_type'] == 'package' + ] return void def make_vocd(this_dataset_updated_uuid, i): - vocd = [(o, i.cd_obj, 
i.addr_const_null) - for o, b in objects.items() if b['id_type'] == 'package'] + vocd = [(o, i.cd_obj, i.addr_const_null) for o, b in objects.items() if b['id_type'] == 'package'] return vocd def make_voqd(this_dataset_updated_uuid, i): @@ -1054,22 +1181,24 @@ def make_voqd(this_dataset_updated_uuid, i): def make_values_cat(this_dataset_updated_uuid, i, luinst): values_cv = [ - (None, - i.ct_hack, - obj_uuid, - i.id_nerve_cross_section, - i.cd_obj, # if we mess this up the fk ok obj_desc_cat will catch it :) - luinst[dataset_uuid, id_formal], # get us the instance - ) + ( + None, + i.ct_hack, + obj_uuid, + i.id_nerve_cross_section, + i.cd_obj, # if we mess this up the fk ok obj_desc_cat will catch it :) + luinst[dataset_uuid, id_formal], # get us the instance + ) for id_formal in [e['id_formal'] for e in nerve_qvs] ] + [ - (None, - i.ct_hack, - obj_uuid, - i.id_fascicle_cross_section, - i.cd_obj, # if we mess this up the fk ok obj_desc_cat will catch it :) - luinst[dataset_uuid, id_formal], # get us the instance - ) + ( + None, + i.ct_hack, + obj_uuid, + i.id_fascicle_cross_section, + i.cd_obj, # if we mess this up the fk ok obj_desc_cat will catch it :) + luinst[dataset_uuid, id_formal], # get us the instance + ) for id_formal in [e['id_formal'] for e in fasc_qvs] ] @@ -1077,39 +1206,50 @@ def make_values_cat(this_dataset_updated_uuid, i, luinst): def make_values_quant(this_dataset_updated_uuid, i, luinst): values_qv = [ - (e[k], - obj_uuid, - i.luid[e['desc_inst']], - qd, - luinst[dataset_uuid, e['id_formal']], - e[k], - ) - for e, k, qd in - [ + ( + e[k], + obj_uuid, + i.luid[e['desc_inst']], + qd, + luinst[dataset_uuid, e['id_formal']], + e[k], + ) + for e, k, qd in [ (e, k, qd) - for e in nerve_qvs - for k, qd in ( + for e in nerve_qvs + for k, qd in ( ('number-of-fascicles', i.qd_count), # FIXME population of thing counts within context ('diameter-um', i.qd_nerve_cs_diameter_um), ('vd', i.qd_nvlai1), ('vd-min', i.qd_nvlain1), ('vd-max', i.qd_nvlaix1), - )] + [ + ) + ] + + [ (e, k, qd) - for e in fasc_qvs - for k, qd in ( + for e in fasc_qvs + for k, qd in ( ('diameter-um', i.qd_fasc_cs_diameter_um), ('vd', i.qd_nvlai1), ('vd-min', i.qd_nvlain1), ('vd-max', i.qd_nvlaix1), - )]] + ) + ] + ] return values_qv - return (updated_transitive, values_objects, values_dataset_object, - make_values_instances, make_values_parents, - make_void, make_vocd, make_voqd, - make_values_cat, make_values_quant, - ) + return ( + updated_transitive, + values_objects, + values_dataset_object, + make_values_instances, + make_values_parents, + make_void, + make_vocd, + make_voqd, + make_values_cat, + make_values_quant, + ) def extract_template(dataset_uuid, source_local=True): @@ -1144,11 +1284,18 @@ def make_values_cat(this_dataset_updated_uuid, i, luinst): def make_values_quant(this_dataset_updated_uuid, i, luinst): return values_qv - return (updated_transitive, values_objects, values_dataset_object, - make_values_instances, make_values_parents, - make_void, make_vocd, make_voqd, - make_values_cat, make_values_quant, - ) + return ( + updated_transitive, + values_objects, + values_dataset_object, + make_values_instances, + make_values_parents, + make_void, + make_vocd, + make_voqd, + make_values_cat, + make_values_quant, + ) def ingest_demo(session, source_local=True, do_insert=True, commit=False, dev=False): @@ -1163,12 +1310,12 @@ def ingest_demo_jp2(session, source_local=True, do_insert=True, commit=False, de def ingest_reva_ft_all(session, source_local=False, do_insert=True, batch=False, commit=False, 
dev=False): dataset_uuids = ( - "aa43eda8-b29a-4c25-9840-ecbd57598afc", # f001 + 'aa43eda8-b29a-4c25-9840-ecbd57598afc', # f001 # the rest have uuid1 issues :/ all in the undefined folder it seems, might be able to fix with a reupload - "bc4cc558-727c-4691-ae6d-498b57a10085", # f002 # XXX has a uuid1 so breaking in prod right now have to push the new pipelines - "ec6ad74e-7b59-409b-8fc7-a304319b6faf", # f003 # also uuid1 issue - "a8b2bdc7-54df-46a3-810e-83cdf33cfc3a", # f004 - "04a5fed9-7ba6-4292-b1a6-9cab5c38895f", # f005 + 'bc4cc558-727c-4691-ae6d-498b57a10085', # f002 # XXX has a uuid1 so breaking in prod right now have to push the new pipelines + 'ec6ad74e-7b59-409b-8fc7-a304319b6faf', # f003 # also uuid1 issue + 'a8b2bdc7-54df-46a3-810e-83cdf33cfc3a', # f004 + '04a5fed9-7ba6-4292-b1a6-9cab5c38895f', # f005 ) batched = [] @@ -1189,8 +1336,8 @@ def ingest_reva_ft_all(session, source_local=False, do_insert=True, batch=False, def main(source_local=False, commit=False, echo=True): from quantdb.config import auth - dbkwargs = {k: auth.get(f"db-{k}") for k in ("user", "host", "port", "database")} # TODO integrate with cli options - dbkwargs["dbuser"] = dbkwargs.pop("user") + dbkwargs = {k: auth.get(f'db-{k}') for k in ('user', 'host', 'port', 'database')} # TODO integrate with cli options + dbkwargs['dbuser'] = dbkwargs.pop('user') engine = create_engine(dbUri(**dbkwargs)) engine.echo = echo session = Session(engine) @@ -1226,5 +1373,5 @@ def main(source_local=False, commit=False, echo=True): engine.dispose() -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/quantdb/main.py b/quantdb/main.py index deb5246..1b3ab8f 100644 --- a/quantdb/main.py +++ b/quantdb/main.py @@ -5,9 +5,9 @@ app = FastAPI() -app.mount("/", WSGIMiddleware(flask_app)) +app.mount('/', WSGIMiddleware(flask_app)) -if __name__ == "__main__": +if __name__ == '__main__': import uvicorn - uvicorn.run("quantdb.main:app", host="127.0.0.1", port=8990, reload=True) + uvicorn.run('quantdb.main:app', host='127.0.0.1', port=8990, reload=True) diff --git a/quantdb/mysql_app/__init__.py b/quantdb/mysql_app/__init__.py index 4064ab4..e73dd2a 100644 --- a/quantdb/mysql_app/__init__.py +++ b/quantdb/mysql_app/__init__.py @@ -2,4 +2,4 @@ from . 
import database -__all__ = ["database"] +__all__ = ['database'] diff --git a/quantdb/router.py b/quantdb/router.py index 0d2913c..71b0c03 100644 --- a/quantdb/router.py +++ b/quantdb/router.py @@ -8,14 +8,14 @@ from quantdb.api_server import app as quantdb_flask_app app = FastAPI() -app.mount("/quantdb", WSGIMiddleware(quantdb_flask_app)) +app.mount('/quantdb', WSGIMiddleware(quantdb_flask_app)) # Root URL -@app.get("/") -def index() -> Literal["Hello"]: - return "Hello" +@app.get('/') +def index() -> Literal['Hello']: + return 'Hello' -if __name__ == "__main__": - uvicorn.run("router:app", host="localhost", port=8000, reload=True) +if __name__ == '__main__': + uvicorn.run('router:app', host='localhost', port=8000, reload=True) diff --git a/quantdb/utils.py b/quantdb/utils.py index 7619e64..35069ec 100644 --- a/quantdb/utils.py +++ b/quantdb/utils.py @@ -12,39 +12,39 @@ def makeSimpleLogger(name, level=logging.INFO): logger.setLevel(level) ch = logging.StreamHandler() # FileHander goes to disk - fmt = "[%(asctime)s] - %(levelname)8s - " "%(name)14s - " "%(filename)16s:%(lineno)-4d - " "%(message)s" + fmt = '[%(asctime)s] - %(levelname)8s - ' '%(name)14s - ' '%(filename)16s:%(lineno)-4d - ' '%(message)s' formatter = logging.Formatter(fmt) ch.setFormatter(formatter) logger.addHandler(ch) return logger -log = makeSimpleLogger("quantdb") -logd = log.getChild("data") +log = makeSimpleLogger('quantdb') +logd = log.getChild('data') # from pyontutils.utils_fast import setPS1 def setPS1(script__file__): try: - text = "Running " + os.path.basename(script__file__) - os.sys.stdout.write("\x1b]2;{}\x07\n".format(text)) + text = 'Running ' + os.path.basename(script__file__) + os.sys.stdout.write('\x1b]2;{}\x07\n'.format(text)) except AttributeError as e: log.exception(e) def dbUri(dbuser, host, port, database): - if hasattr(sys, "pypy_version_info"): - dialect = "psycopg2cffi" + if hasattr(sys, 'pypy_version_info'): + dialect = 'psycopg2cffi' else: - dialect = "psycopg2" - return f"postgresql+{dialect}://{dbuser}@{host}:{port}/{database}" + dialect = 'psycopg2' + return f'postgresql+{dialect}://{dbuser}@{host}:{port}/{database}' # from pyontutils.utils_fast import isoformat -def isoformat(datetime_instance, timespec="auto"): +def isoformat(datetime_instance, timespec='auto'): kwargs = {} if isinstance(datetime_instance, datetime): # don't pass timespec if type is not date not datetime - kwargs["timespec"] = timespec + kwargs['timespec'] = timespec - return datetime_instance.isoformat(**kwargs).replace(".", ",").replace("+00:00", "Z") + return datetime_instance.isoformat(**kwargs).replace('.', ',').replace('+00:00', 'Z') diff --git a/test/test_api.py b/test/test_api.py index cd8a5db..e189822 100644 --- a/test/test_api.py +++ b/test/test_api.py @@ -13,10 +13,10 @@ def test(): client = app.test_client() runner = app.test_cli_runner() - dataset_uuid = "aa43eda8-b29a-4c25-9840-ecbd57598afc" - some_object = "414886a9-9ec7-447e-b4d8-3ae42fda93b7" # XXX FAKE - actual_package_uuid = "15bcbcd5-b054-40ef-9b5c-6a260d441621" - base = "http://localhost:8989/api/1/" + dataset_uuid = 'aa43eda8-b29a-4c25-9840-ecbd57598afc' + some_object = '414886a9-9ec7-447e-b4d8-3ae42fda93b7' # XXX FAKE + actual_package_uuid = '15bcbcd5-b054-40ef-9b5c-6a260d441621' + base = 'http://localhost:8989/api/1/' urls = ( f'{base}values/inst', f'{base}values/inst?dataset={dataset_uuid}', @@ -26,12 +26,10 @@ def test(): f'{base}values/inst?dataset={dataset_uuid}&inst-parent=sub-f001', 
f'{base}values/inst?dataset={dataset_uuid}&inst-parent=sam-r-seg-c1&inst-parent=sam-l-seg-c1', f'{base}values/inst?desc-inst=nerve-volume', - f'{base}objects?dataset={dataset_uuid}', f'{base}objects?dataset={dataset_uuid}&aspect=distance', f'{base}objects?dataset={dataset_uuid}&aspect=distance&value-quant-min=0.5', # expect nothing f'{base}objects?dataset={dataset_uuid}&aspect=distance&value-quant-min=0.5&union-cat-quant=true', - f'{base}objects?dataset={dataset_uuid}&subject=sub-f001', f'{base}objects?subject=sub-f001', f'{base}objects?subject=sub-f001&union-cat-quant=true', @@ -46,7 +44,6 @@ def test(): f'{base}objects?aspect=distance&value-quant-min=0.5', f'{base}objects?aspect=distance&value-quant-min=0.5&source-only=true', f'{base}objects?desc-inst=nerve-volume&aspect=distance&value-quant-min=0.5&source-only=true', - # values-quant f'{base}values/quant?dataset={dataset_uuid}&aspect=distance', f'{base}values/quant?object={actual_package_uuid}&aspect=distance', @@ -54,50 +51,47 @@ def test(): f'{base}values/quant?aspect=distance-via-reva-ft-sample-id-normalized-v1', f'{base}values/quant?aspect=distance-via-reva-ft-sample-id-normalized-v1&agg-type=instance', f'{base}values/quant?aspect=distance-via-reva-ft-sample-id-normalized-v1&value-quant-min=0.4&value-quant-max=0.7', - # values-cat - f"{base}values/cat?object={actual_package_uuid}", - f"{base}values/cat?object={actual_package_uuid}&union-cat-quant=true", # shouldn't need it in this case - f"{base}values/cat-quant?object={actual_package_uuid}", - f"{base}values/cat-quant?object={actual_package_uuid}&union-cat-quant=true", + f'{base}values/cat?object={actual_package_uuid}', + f'{base}values/cat?object={actual_package_uuid}&union-cat-quant=true', # shouldn't need it in this case + f'{base}values/cat-quant?object={actual_package_uuid}', + f'{base}values/cat-quant?object={actual_package_uuid}&union-cat-quant=true', # values-cat-quant - f"{base}values?dataset={dataset_uuid}&aspect=distance&value-quant-min=0.5", - f"{base}values?dataset={dataset_uuid}&aspect=distance&value-quant-min=0.5&union-cat-quant=true", - f"{base}values?object={actual_package_uuid}", - f"{base}values?object={actual_package_uuid}&union-cat-quant=true", - f"{base}values/inst?object={actual_package_uuid}", - f"{base}values/inst?object={actual_package_uuid}&union-cat-quant=true", + f'{base}values?dataset={dataset_uuid}&aspect=distance&value-quant-min=0.5', + f'{base}values?dataset={dataset_uuid}&aspect=distance&value-quant-min=0.5&union-cat-quant=true', + f'{base}values?object={actual_package_uuid}', + f'{base}values?object={actual_package_uuid}&union-cat-quant=true', + f'{base}values/inst?object={actual_package_uuid}', + f'{base}values/inst?object={actual_package_uuid}&union-cat-quant=true', # prov - f"{base}values/inst?prov=true", - f"{base}values/quant?aspect=distance&prov=true", - f"{base}values/cat?object={actual_package_uuid}", - f"{base}values/cat?object={actual_package_uuid}&prov=true", # FIXME somehow this has a 3x increase in records, and non-distinct - f"{base}values/cat-quant?object={actual_package_uuid}&union-cat-quant=true", - f"{base}values/cat-quant?object={actual_package_uuid}&union-cat-quant=true&prov=true", - f"{base}values/cat-quant", - f"{base}values/cat-quant?prov=true", - f"{base}values/cat-quant?union-cat-quant=true", - f"{base}values/cat-quant?union-cat-quant=true&prov=true", + f'{base}values/inst?prov=true', + f'{base}values/quant?aspect=distance&prov=true', + f'{base}values/cat?object={actual_package_uuid}', + 
f'{base}values/cat?object={actual_package_uuid}&prov=true', # FIXME somehow this has a 3x increase in records, and non-distinct + f'{base}values/cat-quant?object={actual_package_uuid}&union-cat-quant=true', + f'{base}values/cat-quant?object={actual_package_uuid}&union-cat-quant=true&prov=true', + f'{base}values/cat-quant', + f'{base}values/cat-quant?prov=true', + f'{base}values/cat-quant?union-cat-quant=true', + f'{base}values/cat-quant?union-cat-quant=true&prov=true', # desc - f"{base}desc/inst", - f"{base}desc/cat", - f"{base}desc/quant", - f"{base}desc/inst?include-unused=true", - f"{base}desc/cat?include-unused=true", - f"{base}desc/quant?include-unused=true", + f'{base}desc/inst', + f'{base}desc/cat', + f'{base}desc/quant', + f'{base}desc/inst?include-unused=true', + f'{base}desc/cat?include-unused=true', + f'{base}desc/quant?include-unused=true', # descriptor values - f"{base}terms", - f"{base}aspects", - f"{base}units", - f"{base}terms?include-unused=true", - f"{base}aspects?include-unused=true", - f"{base}units?include-unused=true", + f'{base}terms', + f'{base}aspects', + f'{base}units', + f'{base}terms?include-unused=true', + f'{base}aspects?include-unused=true', + f'{base}units?include-unused=true', # TODO maybe shapes here as well? - f'{base}terms?inst-parent=sam-r-seg-c1&inst-parent=sam-l-seg-c1', f'{base}aspects?inst-parent=sam-r-seg-c1&inst-parent=sam-l-seg-c1', f'{base}units?inst-parent=sam-r-seg-c1&inst-parent=sam-l-seg-c1', - ) # log.setLevel(9) resps = [] @@ -126,9 +120,7 @@ def test_demo_load(): dataset_uuid = '55c5b69c-a5b8-4881-a105-e4048af26fa5' package_uuid = '20720c2e-83fb-4454-bef1-1ce6a97fa748' base = 'http://localhost:8989/api/1/' - urls = ( - f'{base}values/cat-quant?desc-inst=fascicle-cross-section', - ) + urls = (f'{base}values/cat-quant?desc-inst=fascicle-cross-section',) resps = [] for url in urls: From 269b584e9730daff1657eae631409f47216184ea Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Tue, 4 Feb 2025 12:58:51 -0800 Subject: [PATCH 2/3] ignoring api.py for blue fmt due to sql str --- .pre-commit-config.yaml | 1 + quantdb/api.py | 480 +++++++++++++++++----------------------- 2 files changed, 207 insertions(+), 274 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2dabc8c..d269415 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,3 +18,4 @@ repos: hooks: - id: blue args: ["-l", "120"] + exclude: quantdb/api.py diff --git a/quantdb/api.py b/quantdb/api.py index 44d69fb..cec693f 100644 --- a/quantdb/api.py +++ b/quantdb/api.py @@ -28,39 +28,30 @@ def default(self, obj): url_sql_where = ( # TODO arity spec here + # dupes overwrite params but that is ok, this way we get the correct table alias for both cases - ( - 'object', - 'object', - 'cv.object = any(:object)', - 'cat', - ), # XXX should not use this outside values/ unless we left outer due to intersect ? - ( - 'object', - 'object', - 'qv.object = any(:object)', - 'quant', - ), # XXX should not use this outside values/ unless we left outer due to intersect ? + ('object', 'object', 'cv.object = any(:object)', 'cat'), # XXX should not use this outside values/ unless we left outer due to intersect ? + ('object', 'object', 'qv.object = any(:object)', 'quant'), # XXX should not use this outside values/ unless we left outer due to intersect ? 
+ ('desc-inst', 'desc_inst', 'idin.label = any(:desc_inst)', 'both'), ('dataset', 'dataset', 'im.dataset = :dataset', 'both'), ('inst', 'inst', 'im.id_formal = any(:inst)', 'both'), ('inst-parent', 'inst_parent', 'icin.id_formal = any(:inst_parent)', 'both'), ('subject', 'subject', 'im.id_sub = any(:subject)', 'both'), ('sample', 'sample', 'im.id_sam = any(:sample)', 'both'), + ('desc-cat', 'desc_cat', 'cd.label = any(:desc_cat)', 'cat'), + ('value-cat', 'value_cat', 'ct.label = any(:value_cat)', 'cat'), ('value-cat-open', 'value_cat_open', 'cv.value_open = any(:value_cat_open)', 'cat'), + ('unit', 'unit', 'u.label = any(:unit)', 'quant'), ('aspect', 'aspect', 'ain.label = any(:aspect)', 'quant'), ('agg-type', 'agg_type', 'qd.aggregation_type = :agg_type', 'quant'), # TODO shape + ('value-quant', 'value_quant', 'qv.value = :value_quant', 'quant'), - ( - 'value-quant-margin', - 'value_quant_margin', - 'qv.value <= :value_quant + :value_quant_margin AND qv.value >= :value_quant - :value_quant_margin', - 'quant', - ), + ('value-quant-margin', 'value_quant_margin', 'qv.value <= :value_quant + :value_quant_margin AND qv.value >= :value_quant - :value_quant_margin', 'quant'), ('value-quant-min', 'value_quant_min', 'qv.value >= :value_quant_min', 'quant'), ('value-quant-max', 'value_quant_max', 'qv.value <= :value_quant_max', 'quant'), ) @@ -77,7 +68,9 @@ def get_where(kwargs): _where_cat.append(w) elif t == 'quant': # do not include value-quant if value-quant-margin is provided - if u == 'value-quant' and 'value-quant-margin' in kwargs and kwargs['value-quant-margin']: + if (u == 'value-quant' and + 'value-quant-margin' in kwargs and + kwargs['value-quant-margin']): continue else: _where_quant.append(w) @@ -141,9 +134,8 @@ def main_query(endpoint, kwargs): 'cd.label AS pred_or_asp, ' 'cv.value_open AS vo_or_unit, ' 'ct.label AS value_controlled, ' - 'NULL::numeric AS value' - ), - ( + 'NULL::numeric AS value') + , ( "'value-quant' AS type, im.dataset, " 'im.id_formal AS inst, id.label AS desc_inst, ' 'NULL AS domain, ' @@ -152,10 +144,18 @@ def main_query(endpoint, kwargs): 'a.label AS aspect, ' 'u.label AS unit, ' 'NULL AS vc, qv.value' - ), + )), + 'desc/inst': ( + 'id.iri, ' + 'id.label, ' + 'idpar.label as subclassof ' + ), + 'desc/cat': ( + 'cd.label, ' + 'cdid.label AS domain, ' + 'cd.range, ' + 'cd.description ' ), - 'desc/inst': ('id.iri, ' 'id.label, ' 'idpar.label as subclassof '), - 'desc/cat': ('cd.label, ' 'cdid.label AS domain, ' 'cd.range, ' 'cd.description '), 'desc/quant': ( 'qd.label, ' 'id.label AS domain, ' @@ -165,29 +165,33 @@ def main_query(endpoint, kwargs): 'u.label AS unit, ' 'qd.description ' ), - 'terms': ('ct.iri, ' 'ct.label '), - 'units': ('u.iri, ' 'u.label '), - 'aspects': ('a.iri, ' 'a.label, ' 'aspar.label as subclassof '), + 'terms': ( + 'ct.iri, ' + 'ct.label ' + ), + 'units': ( + 'u.iri, ' + 'u.label ' + ), + 'aspects': ( + 'a.iri, ' + 'a.label, ' + 'aspar.label as subclassof ' + ), }[endpoint] # FIXME move extra and select out and pass then in in as arguments ? or retain control here? 
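    # Illustrative aside, not part of the patch: a sketch of what get_where()
    # above produces for one hypothetical request, to make the cat/quant split
    # concrete (some_uuid is a placeholder, not a real dataset id):
    #   where_cat, where_quant, params = get_where({'dataset': some_uuid, 'aspect': ['distance']})
    #   where_cat   == 'im.dataset = :dataset'
    #   where_quant == 'im.dataset = :dataset AND ain.label = any(:aspect)'
    #   params      == {'dataset': some_uuid, 'aspect': ['distance']}
    # 'dataset' is tagged 'both' in url_sql_where so it lands in both fragments,
    # while 'aspect' is tagged 'quant' and only constrains the values_quant side.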
- def gkw(k): - return k in kwargs and kwargs[k] + def gkw(k): return k in kwargs and kwargs[k] class sn: # select needs objects = endpoint == 'objects' - desc_inst = endpoint not in ( - 'objects', - 'terms', - 'units', - 'aspects', - ) + desc_inst = endpoint not in ('objects', 'terms', 'units', 'aspects',) desc_cat = endpoint in ('values/cat', 'values/cat-quant', 'desc/cat') value_cat = endpoint in ('values/cat', 'values/cat-quant', 'terms') aspect = endpoint in ('values/quant', 'values/cat-quant', 'desc/quant', 'aspects') unit = endpoint in ('values/quant', 'values/cat-quant', 'desc/quant', 'units') agg_type = endpoint in ('values/quant', 'values/cat-quant') - desc_quant = aspect or unit or agg_type + desc_quant = (aspect or unit or agg_type) parent_aspect = endpoint == 'aspects' parent_desc_inst = endpoint == 'desc/inst' @@ -201,7 +205,7 @@ class kw: # keywords aspect = gkw('aspect') unit = gkw('unit') agg_type = gkw('agg-type') - desc_quant = aspect or unit or agg_type + desc_quant = (aspect or unit or agg_type) q_par_desc_inst = """ JOIN descriptors_inst AS idstart ON idstart.id = {join_to}.desc_inst @@ -241,8 +245,7 @@ class kw: # keywords adc.addr_type as prov_value_addr_type, adc.addr_field as prov_value_addr_field, adc.value_type as prov_value_type -""" + ( - """, +""" + (""", NULL::address_type as prov_unit_addr_type, NULL as prov_unit_addr_field, NULL::field_value_type as prov_unit_type, @@ -250,10 +253,7 @@ class kw: # keywords NULL::address_type as prov_aspect_addr_type, NULL as prov_aspect_addr_field, NULL::field_value_type as prov_aspect_type -""" - if sn.unit or endpoint == 'values/inst' - else '' - ) +""" if sn.unit or endpoint == 'values/inst' else '') s_prov_q = """ adq.addr_type as prov_value_addr_type, @@ -287,156 +287,95 @@ class kw: # keywords LEFT OUTER JOIN addresses AS ada ON ada.id = odq.addr_aspect """ - maybe_distinct = ( - 'DISTINCT ' - if ( - endpoint.startswith('desc/') - or endpoint in ('terms', 'units', 'aspects') - or (sn.objects or kw.prov) - and not kw.source_only - ) - else '' - ) + maybe_distinct = 'DISTINCT ' if ( + endpoint.startswith('desc/') or + endpoint in ('terms', 'units', 'aspects') or + (sn.objects or kw.prov) and not kw.source_only) else '' ep_select_cat, ep_select_quant = ep_select if isinstance(ep_select, tuple) else (ep_select, ep_select) select_cat = f'SELECT {maybe_distinct}{ep_select_cat}' + ( - (s_prov_objs + s_prov_i + ((',\n' + s_prov_c) if endpoint != 'values/inst' else '')) if kw.prov else '' - ) + (s_prov_objs + s_prov_i + ((',\n' + s_prov_c) if endpoint != 'values/inst' else '')) if kw.prov else '') select_quant = f'SELECT {maybe_distinct}{ep_select_quant}' + ( - (s_prov_objs + s_prov_i + ((',\n' + s_prov_q) if endpoint != 'values/inst' else '')) if kw.prov else '' - ) + (s_prov_objs + s_prov_i + ((',\n' + s_prov_q) if endpoint != 'values/inst' else '')) if kw.prov else '') _where_cat, _where_quant, params = get_where(kwargs) where_cat = f'WHERE {_where_cat}' if _where_cat else '' where_quant = f'WHERE {_where_quant}' if _where_quant else '' - q_inst_parent = ( - '\n'.join( - ( - 'JOIN values_inst AS icin', - 'CROSS JOIN LATERAL get_child_closed_inst(icin.id) AS ic ON im.id = ic.child', - ) - ) - if kw.parent_inst - else '' - ) + q_inst_parent = '\n'.join(( + 'JOIN values_inst AS icin', + 'CROSS JOIN LATERAL get_child_closed_inst(icin.id) AS ic ON im.id = ic.child', + )) if kw.parent_inst else '' # FIXME even trying to be smart here about which joins to pull just papers over the underlying perf issue # shaves about 140ms off 
but the underlying issue remains - q_cat = '\n'.join( - ( - 'FROM values_cat AS cv', - '\n'.join( - ( - 'JOIN descriptors_inst AS idin', - 'CROSS JOIN LATERAL get_child_closed_desc_inst(idin.id) AS idc ON cv.desc_inst = idc.child -- FIXME', - ) - ) - if kw.desc_inst - else '', - ( - q_par_desc_inst.format(join_to='cv') - if sn.parent_desc_inst - else 'JOIN descriptors_inst AS id ON cv.desc_inst = id.id' - ) - if sn.desc_inst or kw.desc_inst - else '', # FIXME handle parents case - 'JOIN values_inst AS im ON cv.instance = im.id', - q_inst_parent, - '\n'.join( - ( - 'JOIN descriptors_cat AS cd ON cv.desc_cat = cd.id', - 'LEFT OUTER JOIN descriptors_inst AS cdid ON cd.domain = cdid.id -- XXX TODO mismach', - ) - ) - if sn.desc_cat or kw.desc_cat - else '', - 'LEFT OUTER JOIN controlled_terms AS ct ON cv.value_controlled = ct.id' - if sn.value_cat or kw.value_cat - else '', - ( - ( - '\n' - 'JOIN objects AS o ON cv.object = o.id\n' - 'LEFT OUTER JOIN objects_internal AS oi\n' - 'ON oi.id = o.id\n' - ) - if kw.source_only - else ( - '\n' # have to use LEFT OUTER because object might have only one of cat or quant - 'LEFT OUTER JOIN values_quant AS qv ON qv.instance = im.id\n' - 'JOIN objects AS o ON cv.object = o.id OR qv.object = o.id\n' - 'LEFT OUTER JOIN objects_internal AS oi\n' - 'ON oi.id = o.id\n' - ) - ) - if sn.objects or kw.prov - else '', - (q_prov_i + q_prov_c) if kw.prov else '', - ) - ) - - q_quant = '\n'.join( - ( - 'FROM values_quant AS qv', - '\n'.join( - ( - 'JOIN descriptors_inst AS idin', - 'CROSS JOIN LATERAL get_child_closed_desc_inst(idin.id) AS idc ON qv.desc_inst = idc.child -- FIXME', - ) - ) - if kw.desc_inst - else '', - ( - q_par_desc_inst.format(join_to='qv') - if sn.parent_desc_inst - else 'JOIN descriptors_inst AS id ON qv.desc_inst = id.id' - ) - if sn.desc_inst or kw.desc_inst - else '', # FIXME handle parents case - 'JOIN values_inst AS im ON qv.instance = im.id', - q_inst_parent, - 'JOIN descriptors_quant AS qd ON qv.desc_quant = qd.id' if (sn.desc_quant or kw.desc_quant) else '', - '\n'.join( - ( - 'JOIN aspects AS ain', - 'CROSS JOIN LATERAL get_child_closed_aspect(ain.id) AS ac ON qd.aspect = ac.child', - 'JOIN aspects AS a ON ac.child = a.id', - ) - ) - if kw.aspect - else ( - (q_par_aspect if sn.parent_aspect else 'JOIN aspects AS a ON qd.aspect = a.id') if sn.aspect else '' - ), # FIXME handle parents case - 'LEFT OUTER JOIN units AS u ON qd.unit = u.id' if sn.unit or kw.unit else '', - ( - ( - '\n' - 'JOIN objects AS o ON qv.object = o.id\n' - 'LEFT OUTER JOIN objects_internal AS oi ON oi.id = o.id\n' - ) - if kw.source_only - else ( - '\n' # have to use LEFT OUTER because object might have only one of cat or quant - 'LEFT OUTER JOIN values_cat AS cv ON cv.instance = im.id\n' - 'JOIN objects AS o ON qv.object = o.id OR cv.object = o.id\n' - 'LEFT OUTER JOIN objects_internal AS oi ON oi.id = o.id\n' - ) - ) - if sn.objects or kw.prov - else '', - (q_prov_i + q_prov_q) if kw.prov else '', - ) - ) + q_cat = '\n'.join(( + 'FROM values_cat AS cv', + '\n'.join(( + 'JOIN descriptors_inst AS idin', + 'CROSS JOIN LATERAL get_child_closed_desc_inst(idin.id) AS idc ON cv.desc_inst = idc.child -- FIXME', + )) if kw.desc_inst else '', + (q_par_desc_inst.format(join_to='cv') if sn.parent_desc_inst else + 'JOIN descriptors_inst AS id ON cv.desc_inst = id.id' + ) if sn.desc_inst or kw.desc_inst else '', # FIXME handle parents case + 'JOIN values_inst AS im ON cv.instance = im.id', + q_inst_parent, + '\n'.join(( + 'JOIN descriptors_cat AS cd ON cv.desc_cat = cd.id', + 
'LEFT OUTER JOIN descriptors_inst AS cdid ON cd.domain = cdid.id -- XXX TODO mismach', + )) if sn.desc_cat or kw.desc_cat else '', + 'LEFT OUTER JOIN controlled_terms AS ct ON cv.value_controlled = ct.id' if sn.value_cat or kw.value_cat else '', + (('\n' + 'JOIN objects AS o ON cv.object = o.id\n' + 'LEFT OUTER JOIN objects_internal AS oi\n' + 'ON oi.id = o.id\n') + if kw.source_only else + ('\n' # have to use LEFT OUTER because object might have only one of cat or quant + 'LEFT OUTER JOIN values_quant AS qv ON qv.instance = im.id\n' + 'JOIN objects AS o ON cv.object = o.id OR qv.object = o.id\n' + 'LEFT OUTER JOIN objects_internal AS oi\n' + 'ON oi.id = o.id\n') + ) if sn.objects or kw.prov else '', + (q_prov_i + q_prov_c) if kw.prov else '', + )) + + q_quant = '\n'.join(( + 'FROM values_quant AS qv', + '\n'.join(( + 'JOIN descriptors_inst AS idin', + 'CROSS JOIN LATERAL get_child_closed_desc_inst(idin.id) AS idc ON qv.desc_inst = idc.child -- FIXME', + )) if kw.desc_inst else '', + (q_par_desc_inst.format(join_to='qv') if sn.parent_desc_inst else + 'JOIN descriptors_inst AS id ON qv.desc_inst = id.id' + ) if sn.desc_inst or kw.desc_inst else '', # FIXME handle parents case + 'JOIN values_inst AS im ON qv.instance = im.id', + q_inst_parent, + 'JOIN descriptors_quant AS qd ON qv.desc_quant = qd.id' if ( + sn.desc_quant or kw.desc_quant) else '', + '\n'.join(( + 'JOIN aspects AS ain', + 'CROSS JOIN LATERAL get_child_closed_aspect(ain.id) AS ac ON qd.aspect = ac.child', + 'JOIN aspects AS a ON ac.child = a.id', + )) if kw.aspect else ( + (q_par_aspect if sn.parent_aspect else + 'JOIN aspects AS a ON qd.aspect = a.id' + ) if sn.aspect else ''), # FIXME handle parents case + 'LEFT OUTER JOIN units AS u ON qd.unit = u.id' if sn.unit or kw.unit else '', + (('\n' + 'JOIN objects AS o ON qv.object = o.id\n' + 'LEFT OUTER JOIN objects_internal AS oi ON oi.id = o.id\n') + if kw.source_only else + ('\n' # have to use LEFT OUTER because object might have only one of cat or quant + 'LEFT OUTER JOIN values_cat AS cv ON cv.instance = im.id\n' + 'JOIN objects AS o ON qv.object = o.id OR cv.object = o.id\n' + 'LEFT OUTER JOIN objects_internal AS oi ON oi.id = o.id\n') + ) if sn.objects or kw.prov else '', + (q_prov_i + q_prov_q) if kw.prov else '', + )) sw_cat = f'{select_cat}\n{q_cat}\n{where_cat}' # XXX yes this can be malformed in some cases sw_quant = f'{select_quant}\n{q_quant}\n{where_quant}' # XXX yes this can be malformed in some cases if endpoint in ('values/cat', 'terms', 'desc/cat'): query = sw_cat - elif endpoint in ( - 'values/quant', - 'units', - 'aspects', - 'desc/quant', - ): # FIXME TODO make it possible to cross query terms, units, aspects + elif endpoint in ('values/quant', 'units', 'aspects', 'desc/quant'): # FIXME TODO make it possible to cross query terms, units, aspects query = sw_quant else: operator = 'UNION' if 'union-cat-quant' in kwargs and kwargs['union-cat-quant'] else 'INTERSECT' @@ -450,18 +389,13 @@ def to_json(record_type, res, prov=False): rows = list(res) if rows: if record_type == 'object': - result = [ - ( - {k: v for k, v in r._asdict().items() if k != 'id'} - # do not leak internal ids because the might change and are not meaningful - if r.id_type == 'quantdb' - else {k: v for k, v in r._asdict().items() if k != 'updated_transitive'} - ) - for r in rows - ] + result = [{k: v for k, v in r._asdict().items() if k != 'id'} + # do not leak internal ids because the might change and are not meaningful + if r.id_type == 'quantdb' else + {k: v for k, v in r._asdict().items() 
if k != 'updated_transitive'} + for r in rows] elif record_type is None and 'type' in rows[0]._fields: rem_cat = 'value', 'agg_type' - def type_fields_cat(k): if k == 'pred_or_asp': return 'desc_cat' @@ -471,7 +405,6 @@ def type_fields_cat(k): return k rem_quant = 'domain', 'range', 'value_controlled' - def type_fields_quant(k): if k == 'pred_or_asp': return 'aspect' @@ -503,14 +436,9 @@ def prow(r): r.pop(cull_none) if prov: - def pop_prefix(d, prefix): usc = prefix.count('_') - return { - k.split('_', 1 + usc)[-1]: v - for k in list(d) - if k.startswith(prefix + '_') and (v := d.pop(k)) is not None - } + return {k.split('_', 1 + usc)[-1]:v for k in list(d) if k.startswith(prefix + '_') and (v := d.pop(k)) is not None} for r in result: provs = pop_prefix(r, 'prov') @@ -529,7 +457,7 @@ def pop_prefix(d, prefix): r['prov'] = provs out = result - # breakpoint() + #breakpoint() else: out = [] @@ -556,8 +484,10 @@ def wrap_out(endpoint, kwargs, out): args_default = { 'object': [], 'updated-transitive': None, # TODO needed to query for some internal + ## inst 'desc-inst': [], # aka class + # value-inst 'dataset': None, 'inst': [], @@ -565,31 +495,38 @@ def wrap_out(endpoint, kwargs, out): 'subject': [], 'sample': [], 'include-equivalent': False, + ## cat 'desc-cat': [], # aka predicate + 'value-cat': [], 'value-cat-open': [], + ## quant # desc-quant 'unit': [], 'aspect': [], 'agg-type': None, # TODO shape + 'value-quant': None, 'value-quant-margin': None, 'value-quant-min': None, 'value-quant-max': None, + 'limit': 100, #'operator': 'INTERSECT', # XXX ... 'union-cat-quant': False, # by default we intersect but sometimes you want the union instead e.g. if object is passed 'source-only': False, 'include-unused': False, 'prov': False, + #'cat-value': [], #'class': [], #'predicate': None, #'object': None, #'filter': [], + #'quant-value': None, #'quant-margin': None, #'quant-min': None, @@ -645,9 +582,7 @@ def getArgs(request, endpoint, dev=False): def convert(k, d): if k in request.args: # arity is determined here - if k in ('dataset', 'include-equivalent', 'union-cat-quant', 'include-unused', 'agg-type') or k.startswith( - 'value-quant' - ): + if k in ('dataset', 'include-equivalent', 'union-cat-quant', 'include-unused', 'agg-type') or k.startswith('value-quant'): v = request.args[k] if k in ('dataset',): if not v: @@ -692,13 +627,14 @@ def convert(k, d): else: return v - out = {k: convert(k, v) for k, v in default.items()} + out = {k:convert(k, v) for k, v in default.items()} return out def make_app(db=None, name='quantdb-api-server', dev=False): app = Flask(name) - kwargs = {k: auth.get(f'db-{k}') for k in ('user', 'host', 'port', 'database')} # TODO integrate with cli options + kwargs = {k:auth.get(f'db-{k}') # TODO integrate with cli options + for k in ('user', 'host', 'port', 'database')} kwargs['dbuser'] = kwargs.pop('user') app.config['SQLALCHEMY_DATABASE_URI'] = dbUri(**kwargs) # use os.environ.update app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False @@ -716,8 +652,7 @@ def default_flow(endpoint, record_type, query_fun, json_fun, alt_query_fun=None) breakpoint() raise e - def gkw(k): - return k in kwargs and kwargs[k] + def gkw(k): return k in kwargs and kwargs[k] if gkw('include-unused'): query_fun = alt_query_fun @@ -730,24 +665,19 @@ def gkw(k): raise e if gkw('return-query'): - # from psycopg2cffi._impl.cursor import _combine_cmd_params # this was an absolute pita to track down - # stq = sql_text(query) - # stq = stq.bindparams(**params) - # conn = session.connection() - # cur = 
-            # cq, cp, _ = stq._compile_w_cache(dialect=conn.dialect, compiled_cache=conn.engine._compiled_cache, column_keys=sorted(params))
-            # almost = str(stq.compile(dialect=conn.dialect,))  #compile_kwargs={'literal_binds': True},
-            # wat = _combine_cmd_params(str(cq), params, cur.connection)
+            #from psycopg2cffi._impl.cursor import _combine_cmd_params # this was an absolute pita to track down
+            #stq = sql_text(query)
+            #stq = stq.bindparams(**params)
+            #conn = session.connection()
+            #cur = conn.engine.raw_connection().cursor()
+            #cq, cp, _ = stq._compile_w_cache(dialect=conn.dialect, compiled_cache=conn.engine._compiled_cache, column_keys=sorted(params))
+            #almost = str(stq.compile(dialect=conn.dialect,)) #compile_kwargs={'literal_binds': True},
+            #wat = _combine_cmd_params(str(cq), params, cur.connection)
             ord_params = {k: v for k, v in sorted(params.items())}
             ARRAY = 'ARRAY'
             ccuuid = '::uuid'
-            org_vars = ' '.join(
-                [
-                    f':var {key}="{ARRAY + repr(value) if isinstance(value, list) else (repr(str(value)) + ccuuid if isinstance(value, uuid.UUID) else repr(value))}"'
-                    for key, value in ord_params.items()
-                ]
-            )
-            return f"""

 SQL query expansion for quantdb
@@ -761,7 +691,7 @@ def gkw(k):

 {query}

-"""
+'''

         try:
             res = session.execute(sql_text(query), params)
@@ -778,14 +708,9 @@ def gkw(k):

         return resp

-    @app.route(f'{bp}/test')
-    def route_test_check():
-        """objects with derived values that match all criteria"""
-        return 'testing-api'
-
     @app.route(f'{bp}/objects')
     def route_1_objects():
-        """objects with derived values that match all criteria"""
+        "objects with derived values that match all criteria"
         return default_flow('objects', 'object', main_query, to_json)

     @app.route(f'{bp}/desc/inst')
@@ -793,15 +718,17 @@ def route_1_objects():
     @app.route(f'{bp}/classes')
     def route_1_desc_inst():
         def query(endpoint, kwargs):
-            return (
-                'SELECT '
-                'id.iri, '
-                'id.label, '
-                'idpar.label as subclassof'
-                'FROM descriptors_inst AS id'
-                'LEFT OUTER JOIN class_parent AS clp ON clp.id = id.id'
-                'LEFT OUTER JOIN descriptors_inst AS idpar ON idpar.id = clp.parent'
-            ), {}
+            return ('SELECT '
+
+                    'id.iri, '
+                    'id.label, '
+                    'idpar.label as subclassof'
+
+                    + """
+FROM descriptors_inst AS id
+LEFT OUTER JOIN class_parent AS clp ON clp.id = id.id
+LEFT OUTER JOIN descriptors_inst AS idpar ON idpar.id = clp.parent
+"""), {}

         return default_flow('desc/inst', 'desc-inst', main_query, to_json, alt_query_fun=query)

@@ -810,47 +737,45 @@ def query(endpoint, kwargs):
     @app.route(f'{bp}/predicates')
     def route_1_desc_cat():
         def query(endpoint, kwargs):
-            return (
-                'select '
-                'cd.label, '
-                'cdid.label AS domain, '
-                'cd.range, '
-                'cd.description '
-                'from descriptors_cat as cd '
-                'left outer join descriptors_inst as cdid on cdid.id = cd.domain'
-            ), {}
+            return ('select '

-        return default_flow(
-            'desc/cat', 'desc-cat', main_query, to_json, alt_query_fun=query
-        )  # TODO likely need different args e.g. to filter by desc_inst
+                    'cd.label, '
+                    'cdid.label AS domain, '
+                    'cd.range, '
+                    'cd.description '
+
+                    'from descriptors_cat as cd '
+                    'left outer join descriptors_inst as cdid on cdid.id = cd.domain'
+                    ), {}
+
+        return default_flow('desc/cat', 'desc-cat', main_query, to_json, alt_query_fun=query)  # TODO likely need different args e.g. to filter by desc_inst

     @app.route(f'{bp}/desc/quant')
     @app.route(f'{bp}/descriptors/quant')
     def route_1_desc_quant():
         def query(endpoint, kwargs):
-            return (
-                'select '
-                'qd.label, '
-                'id.label AS domain, '
-                'qd.shape, '
-                'qd.aggregation_type as agg_type, '
-                'a.label AS aspect, '
-                'u.label AS unit, '
-                'qd.description '
-                'from descriptors_quant as qd '
-                'left outer join descriptors_inst as id on id.id = qd.domain '
-                'left outer join units as u on u.id = qd.unit '
-                'join aspects as a on a.id = qd.aspect'
-            ), {}
+            return ('select '

-        return default_flow(
-            'desc/quant', 'desc-quant', main_query, to_json, alt_query_fun=query
-        )  # TODO likely need different args e.g. to filter by desc_inst
+                    'qd.label, '
+                    'id.label AS domain, '
+                    'qd.shape, '
+                    'qd.aggregation_type as agg_type, '
+                    'a.label AS aspect, '
+                    'u.label AS unit, '
+                    'qd.description '
+
+                    'from descriptors_quant as qd '
+                    'left outer join descriptors_inst as id on id.id = qd.domain '
+                    'left outer join units as u on u.id = qd.unit '
+                    'join aspects as a on a.id = qd.aspect'
+                    ), {}
+
+        return default_flow('desc/quant', 'desc-quant', main_query, to_json, alt_query_fun=query)  # TODO likely need different args e.g. to filter by desc_inst

     @app.route(f'{bp}/values/inst')
     @app.route(f'{bp}/instances')
     def route_1_val_inst():
-        """instances associated with values that match all criteria"""
+        "instances associated with values that match all criteria"
         return default_flow('values/inst', 'instance', main_query, to_json)

     @app.route(f'{bp}/values')
@@ -870,34 +795,41 @@ def route_1_val_quant():
     @app.route(f'{bp}/controlled-terms')
     def route_1_cterms():
         def query(endpoint, kwargs):
-            return ('select ' 'ct.iri, ' 'ct.label ' 'from controlled_terms as ct'), {}
+            return ('select '
+
+                    'ct.iri, '
+                    'ct.label '
+
+                    'from controlled_terms as ct'), {}

         return default_flow('terms', 'term', main_query, to_json, alt_query_fun=query)

     @app.route(f'{bp}/units')
     def route_1_units():
         def query(endpoint, kwargs):
-            return ('select ' 'u.iri, ' 'u.label ' 'from units as u'), {}
+            return ('select '
+
+                    'u.iri, '
+                    'u.label '
+
+                    'from units as u'), {}

         return default_flow('units', 'unit', main_query, to_json, alt_query_fun=query)

     @app.route(f'{bp}/aspects')
     def route_1_aspects():
         def query(endpoint, kwargs):
-            return (
-                (
-                    'SELECT '
+            return ('SELECT '
+
                     'a.iri, '
                     'a.label, '
                     'aspar.label AS subclassof '
+
                     + """
 FROM aspects AS a
 LEFT OUTER JOIN aspect_parent AS ap ON ap.id = a.id
 LEFT OUTER JOIN aspects AS aspar ON aspar.id = ap.parent
-"""
-                ),
-                {},
-            )
+"""), {}

         return default_flow('aspects', 'aspect', main_query, to_json, alt_query_fun=query)


From dea583e42613109d6b877af209deea65aa9213a0 Mon Sep 17 00:00:00 2001
From: Troy Sincomb
Date: Tue, 4 Feb 2025 13:04:36 -0800
Subject: [PATCH 3/3] ignoring api.py for blue fmt due to sql str

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d269415..65545af 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,4 +18,4 @@ repos:
     hooks:
       - id: blue
         args: ["-l", "120"]
-        exclude: quantdb/api.py
+        exclude: api.py
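
Note on the final hunk (a reviewer sketch, not part of the patch): pre-commit
treats 'exclude' as an unanchored Python regular expression searched against
the repo-relative path, so the shortened pattern still matches quantdb/api.py,
but it can also match any other path containing that substring, e.g.
test/test_api.py from this repo's diffstat. A minimal check, assuming
pre-commit's documented re.search matching semantics:

    import re

    # paths taken from this repo's diffstat; patterns are illustrative
    paths = ['quantdb/api.py', 'quantdb/api_server.py', 'test/test_api.py']
    for pattern in ('quantdb/api.py', 'api.py', r'^quantdb/api\.py$'):
        # exclude patterns are matched with re.search, so an unanchored
        # pattern can hit anywhere in the path
        print(pattern, '->', [p for p in paths if re.search(pattern, p)])

Anchoring the pattern, as in the third form above, would keep the exclusion
scoped to the one file whose long SQL strings blue should not reflow.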