From f90f42b5d67c71a25dc205bedcebdfc5360388d1 Mon Sep 17 00:00:00 2001 From: Oscar Nydza Date: Mon, 11 Mar 2024 11:56:33 +0100 Subject: [PATCH 1/2] Feature/pandas api isin (#30) code, test and documentation of isin --------- Co-authored-by: marcosvm13 Co-authored-by: cperezln --- docs/user-guide/advanced/Pandas_API.ipynb | 99 +++++++++++++++++++++++ src/pykx/pandas_api/pandas_meta.py | 47 +++++++++++ tests/test_pandas_api.py | 24 ++++++ 3 files changed, 170 insertions(+) diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb index ee56ed9..91fd816 100644 --- a/docs/user-guide/advanced/Pandas_API.ipynb +++ b/docs/user-guide/advanced/Pandas_API.ipynb @@ -2659,6 +2659,105 @@ "Example Table." ] }, + { + "cell_type": "markdown", + "id": "7f08eb84", + "metadata": {}, + "source": [ + "## Comparison\n", + "\n", + "### Table.isin()\n", + "\n", + "```\n", + "Table.isin(\n", + " values\n", + ")\n", + "```\n", + "\n", + "Whether each element in the DataFrame is contained in values.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :--------------: | :---------------------------------: | :-------------------------------------------------------------------------- | :------: |\n", + "| values | Union[List, dict, Table, KeyedTable] | The result will only be true at a location if all the labels match. If values is a dict, the keys must be the column names, which must match. If values is a Table or KeyedTable, then both the index and column labels must match. | _required_ |\n", + "\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :-----------------------: | :---------------------------------------------- |\n", + "| Table | Boolean type Table/KeyedTable showing whether each element in the DataFrame is contained in values.|\n", + "\n", + "**Examples:**\n", + "\n", + "Example Table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6e453c8", + "metadata": {}, + "outputs": [], + "source": [ + "tab = kx.Table(data={'x': list(range(3)), 'y': [\"A\", \"B\", \"C\"]})" + ] + }, + { + "cell_type": "markdown", + "id": "aadd23c1", + "metadata": {}, + "source": [ + "Find if element \"A\" or \"1\" is in the table:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d41d40e0", + "metadata": {}, + "outputs": [], + "source": [ + "tab.isin([\"A\", 1])" + ] + }, + { + "cell_type": "markdown", + "id": "cff856fe", + "metadata": {}, + "source": [ + "Find if element \"A\" is in column \"y\":" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bccf59d9", + "metadata": {}, + "outputs": [], + "source": [ + "tab.isin({\"y\": [\"A\"]})" + ] + }, + { + "cell_type": "markdown", + "id": "ed704cce", + "metadata": {}, + "source": [ + "Find if element \"A\" is in the first position of \"y\" column:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41840cc0", + "metadata": {}, + "outputs": [], + "source": [ + "tab.isin(kx.Table(data={\"y\":[\"A\"]}))" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py index 659a26b..53e4045 100644 --- a/src/pykx/pandas_api/pandas_meta.py +++ b/src/pykx/pandas_api/pandas_meta.py @@ -243,6 +243,53 @@ def abs(self, numeric_only=False): tab = _get_numeric_only_subtable(self) return q.abs(tab) + @api_return + def isin(self, values): + tab = self + key_table = 'KeyedTable' in str(type(tab)) + key_value = 'KeyedTable' in str(type(values)) + n_rows = 0 + false_dataframe_f = q("""{u:(cols x); + v:(count[u],count[x])#0b; + flip u!v}""") + if key_value and not key_table: + return false_dataframe_f(tab) + if key_table: + kcols = q.key(tab) + if key_value: + n_rows, tab = q("""{n_rows:max 0, count[x]- + count rows:(key y) inter key x; + (n_rows; x each rows)}""", tab, values) + 
values = q.value(values) + else: + tab = q.value(tab) + dic_value, is_tab = q("""{$[98h = type x; + (flip x; 1b); + (x; 0b)]}""", values) + if key_table and not key_value and is_tab: + ftable = false_dataframe_f(tab) + else: + ftable = q("""{ [table; values; is_tab; n_rows] + flip (cols table)! + {[col_name; tab; values; v_is_tab; n_rows] + col: tab col_name; + ltype: .Q.ty col; + values: $[99h~type values; values col_name; values]; + $[v_is_tab or ltype=" "; ; + values@:where (lower ltype) = .Q.t abs type each values]; + $[0 = count values; + (n_rows + count[col])#0b; + $[v_is_tab; + $[any ltype = (" ";"C"); ~'; =] + [mlen#col;mlen#values], + (n_rows + max 0,count[col]- + mlen: min count[values], + count[col])#0b; + any $[any ltype = (" ";"C"); ~/:\:; =\:][values;col] + ]]}[; table; values; is_tab; n_rows] + each cols table}""", tab, dic_value, is_tab, n_rows) + return ftable.set_index(kcols) if key_table else ftable + @convert_result def all(self, axis=0, bool_only=False, skipna=True): res, cols = preparse_computations(self, axis, skipna, bool_only=bool_only) diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py index 6cae5a8..312cef1 100644 --- a/tests/test_pandas_api.py +++ b/tests/test_pandas_api.py @@ -2109,6 +2109,30 @@ def test_keyed_loc_fixes(q): mkt['k1'] +def test_pandas_isin(kx): + tab = kx.q("""([] k1: 0n 1. 0n 2. 0n; + k2: ("A";" ";"B";" ";"A"); + k3: (`a;1.;`c;5;`d))""") + keyed_tab = kx.q("""([`a`b`c`d`e] + k1: 0n 1. 0n 2. 0n; + k2: ("A";" ";"B";" ";"A"); + k3: (`a;1.;`c;5;`d))""") + + list_value = kx.q('(`a;1.;"A")') + tab_value = kx.q('([] k1: 1. 2. 3.; k2: ("A";"B";"C"))') + dict_value = {"k1": [1., 2., 3.]} + keyed_tab_value = kx.q('([`a`b] k1: 1. 
2.; k2: ("A";"B"))') + + assert tab.isin(list_value).pd().equals(tab.pd().isin(list_value.py())) + assert tab.isin(tab_value).pd().equals(tab.pd().isin(tab_value.pd())) + assert tab.isin(dict_value).pd().equals(tab.pd().isin(dict_value)) + assert tab.isin(keyed_tab_value).pd().equals(tab.pd().isin(keyed_tab_value)) + assert keyed_tab.isin(list_value).pd().equals(keyed_tab.pd().isin(list_value.py())) + assert keyed_tab.isin(dict_value).pd().equals(keyed_tab.pd().isin(dict_value)) + assert keyed_tab.isin(keyed_tab_value).pd().equals(keyed_tab.pd().isin(keyed_tab_value.pd())) + assert keyed_tab.isin(tab_value).pd().equals(keyed_tab.pd().isin(tab_value)) + + def test_pandas_count(q): tab = q('([] k1: 0n 2 0n 2 0n ; k2: (`a;`;`b;`;`c))') df = tab.pd() From 7c061343e8c23f1bb5d8d11cb359209fb941c90b Mon Sep 17 00:00:00 2001 From: Oscar Nydza Date: Mon, 11 Mar 2024 11:58:15 +0100 Subject: [PATCH 2/2] Reimplemented isin function --- src/pykx/pandas_api/pandas_meta.py | 105 ++++++++++++++++++----------- tests/test_pandas_api.py | 48 ++++++++++--- 2 files changed, 103 insertions(+), 50 deletions(-) diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py index 53e4045..c5b6039 100644 --- a/src/pykx/pandas_api/pandas_meta.py +++ b/src/pykx/pandas_api/pandas_meta.py @@ -246,49 +246,72 @@ def abs(self, numeric_only=False): @api_return def isin(self, values): tab = self - key_table = 'KeyedTable' in str(type(tab)) - key_value = 'KeyedTable' in str(type(values)) - n_rows = 0 - false_dataframe_f = q("""{u:(cols x); + + keyed_in = 'KeyedTable' in str(type(tab)) + keyed_val = 'KeyedTable' in str(type(values)) + + false_table = q("""{u:$[99h~type x;cols value x;cols x]; v:(count[u],count[x])#0b; - flip u!v}""") - if key_value and not key_table: - return false_dataframe_f(tab) - if key_table: - kcols = q.key(tab) - if key_value: - n_rows, tab = q("""{n_rows:max 0, count[x]- - count rows:(key y) inter key x; - (n_rows; x each rows)}""", tab, values) - values = 
q.value(values) - else: - tab = q.value(tab) - dic_value, is_tab = q("""{$[98h = type x; - (flip x; 1b); - (x; 0b)]}""", values) - if key_table and not key_value and is_tab: - ftable = false_dataframe_f(tab) + t:flip u!v; + $[99h~type x;key[x]!t;t]}""", tab) + + isin_tab = q('''{[it;vt;ft] + idxt:raze value flip key it; + colt:1_cols it; + idxv:raze value flip key vt; + colv:1_cols vt; + p:(idxt inter idxv) cross colt inter colv; + cv:{[k1;k2;ti;tv] + enlist[ti[k1][k2]] in enlist[tv[k1][k2]]}[;;it;vt]; + vals:flip `x`field`values!flip[p],flip cv .' p; + aux:exec ((`$string field)!values) by x:x from vals; + aux or ft}''') + + gen_idx = q('{flip enlist[`x]!enlist til x}') + + # list (PyKX and Python) + if "list" in str(type(values)).lower(): + return q('{x in y}', tab, values) + # table + elif q('{98h~type x}', values): + if keyed_in != keyed_val: + return false_table + + idx_tab = gen_idx(len(tab)) + idx_values = gen_idx(len(values)) + return q.value(isin_tab(tab.set_index(idx_tab), + values.set_index(idx_values), + false_table.set_index(idx_tab))) + # keyed table + elif keyed_val: + if keyed_in != keyed_val or len(q.key(tab).columns) != len(q.key(values).columns): + return false_table + + old_idx_tab = q.key(tab) + idx_tab = gen_idx(len(tab)) + + idx_values = q('''{kt:flip value flip key x; + kv:flip value flip key y; + flip enlist[`x]!enlist count[kv]#kt?inter[kv;kt]}''', tab, values) + + res = q.value(isin_tab(q.value(tab).set_index(idx_tab), + q.value(values).set_index(idx_values), + q.value(false_table).set_index(idx_tab))) + + return res.set_index(old_idx_tab) + # dict + elif isinstance(values, dict) or q('{99h~type x}', values): + return q('''{[t;d] + tv:$[kt:99h~type t;value t;t]; + cd:{[k;t;d] + $[k in key d; + t[k] in d[k]; + count[t]#0b]}[;tv;d]; + r:flip cols[tv]!cd each cols tv; + $[kt;key[t]!r;r]} + ''', tab, values) else: - ftable = q("""{ [table; values; is_tab; n_rows] - flip (cols table)! 
- {[col_name; tab; values; v_is_tab; n_rows] - col: tab col_name; - ltype: .Q.ty col; - values: $[99h~type values; values col_name; values]; - $[v_is_tab or ltype=" "; ; - values@:where (lower ltype) = .Q.t abs type each values]; - $[0 = count values; - (n_rows + count[col])#0b; - $[v_is_tab; - $[any ltype = (" ";"C"); ~'; =] - [mlen#col;mlen#values], - (n_rows + max 0,count[col]- - mlen: min count[values], - count[col])#0b; - any $[any ltype = (" ";"C"); ~/:\:; =\:][values;col] - ]]}[; table; values; is_tab; n_rows] - each cols table}""", tab, dic_value, is_tab, n_rows) - return ftable.set_index(kcols) if key_table else ftable + raise ValueError("Not a valid argument type.") @convert_result def all(self, axis=0, bool_only=False, skipna=True): diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py index 312cef1..8d42bd9 100644 --- a/tests/test_pandas_api.py +++ b/tests/test_pandas_api.py @@ -2118,19 +2118,49 @@ def test_pandas_isin(kx): k2: ("A";" ";"B";" ";"A"); k3: (`a;1.;`c;5;`d))""") + multi_keyed_index = kx.q('([]x:0 1 0 1;y:0 0 1 1)') + multi_keyed_table = kx.q('''([]a:`foo`bar`baz`qux; + b:"f"$til 4; + c:reverse "f"$til 4)''').set_index(multi_keyed_index) + list_value = kx.q('(`a;1.;"A")') tab_value = kx.q('([] k1: 1. 2. 3.; k2: ("A";"B";"C"))') dict_value = {"k1": [1., 2., 3.]} keyed_tab_value = kx.q('([`a`b] k1: 1. 
2.; k2: ("A";"B"))') - - assert tab.isin(list_value).pd().equals(tab.pd().isin(list_value.py())) - assert tab.isin(tab_value).pd().equals(tab.pd().isin(tab_value.pd())) - assert tab.isin(dict_value).pd().equals(tab.pd().isin(dict_value)) - assert tab.isin(keyed_tab_value).pd().equals(tab.pd().isin(keyed_tab_value)) - assert keyed_tab.isin(list_value).pd().equals(keyed_tab.pd().isin(list_value.py())) - assert keyed_tab.isin(dict_value).pd().equals(keyed_tab.pd().isin(dict_value)) - assert keyed_tab.isin(keyed_tab_value).pd().equals(keyed_tab.pd().isin(keyed_tab_value.pd())) - assert keyed_tab.isin(tab_value).pd().equals(keyed_tab.pd().isin(tab_value)) + multi_keyed_value_index = kx.q('([]x:1 1 0;y:0 1 0)') + multi_keyed_value = kx.q('''([]a:`bar`foo`foo; + b:0 1 0)''').set_index(multi_keyed_value_index) + + pd.testing.assert_frame_equal(tab.isin(list_value).pd(), + tab.pd().isin(list_value.py())) + pd.testing.assert_frame_equal(tab.isin(dict_value).pd(), + tab.pd().isin(dict_value)) + pd.testing.assert_frame_equal(tab.isin(tab_value).pd(), + tab.pd().isin(tab_value.pd())) + pd.testing.assert_frame_equal(tab.isin(keyed_tab_value).pd(), + tab.pd().isin(keyed_tab_value.pd())) + + pd.testing.assert_frame_equal(keyed_tab.isin(list_value).pd(), + keyed_tab.pd().isin(list_value.py())) + pd.testing.assert_frame_equal(keyed_tab.isin(dict_value).pd(), + keyed_tab.pd().isin(dict_value)) + pd.testing.assert_frame_equal(keyed_tab.isin(tab_value).pd(), + keyed_tab.pd().isin(tab_value.pd())) + pd.testing.assert_frame_equal(keyed_tab.isin(keyed_tab_value).pd(), + keyed_tab.pd().isin(keyed_tab_value.pd())) + pd.testing.assert_frame_equal(keyed_tab.isin(multi_keyed_value).pd(), + keyed_tab.pd().isin(multi_keyed_value.pd())) + + pd.testing.assert_frame_equal(multi_keyed_table.isin(list_value).pd(), + multi_keyed_table.pd().isin(list_value.py())) + pd.testing.assert_frame_equal(multi_keyed_table.isin(dict_value).pd(), + multi_keyed_table.pd().isin(dict_value)) + 
pd.testing.assert_frame_equal(multi_keyed_table.isin(tab_value).pd(), + multi_keyed_table.pd().isin(tab_value.pd())) + pd.testing.assert_frame_equal(multi_keyed_table.isin(multi_keyed_value).pd(), + multi_keyed_table.pd().isin(multi_keyed_value.pd())) + pd.testing.assert_frame_equal(multi_keyed_table.isin(keyed_tab_value).pd(), + multi_keyed_table.pd().isin(keyed_tab_value.pd())) def test_pandas_count(q):