From f0e67c1ec8a2bb4db92754613f45d1bab16418ed Mon Sep 17 00:00:00 2001 From: Oscar Nydza Date: Fri, 16 Feb 2024 13:11:31 +0100 Subject: [PATCH] Feature/pandas api isin (#30) code, test and documentation of isin --------- Co-authored-by: marcosvm13 Co-authored-by: cperezln --- docs/user-guide/advanced/Pandas_API.ipynb | 99 +++++++++++++++++++++++ src/pykx/pandas_api/pandas_meta.py | 47 +++++++++++ tests/test_pandas_api.py | 24 ++++++ 3 files changed, 170 insertions(+) diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb index c35d64b..a27248e 100644 --- a/docs/user-guide/advanced/Pandas_API.ipynb +++ b/docs/user-guide/advanced/Pandas_API.ipynb @@ -3116,6 +3116,105 @@ "Example Table." ] }, + { + "cell_type": "markdown", + "id": "7f08eb84", + "metadata": {}, + "source": [ + "## Comparison\n", + "\n", + "### Table.isin()\n", + "\n", + "```\n", + "Table.isin(\n", + " values\n", + ")\n", + "```\n", + "\n", + "Whether each element in the DataFrame is contained in values.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :--------------: | :---------------------------------: | :-------------------------------------------------------------------------- | :------: |\n", + "| values | Union[List, dict, Table, KeyedTable] | The result will only be true at a location if all the labels match. If values is a dict, the keys must be the column names, which must match. If values is a Table or KeyedTable, then both the index and column labels must match. | None|\n", + "\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :-----------------------: | :---------------------------------------------- |\n", + "| Table | Boolean type Table/KeyedTable showing whether each element in the DataFrame is contained in values.|\n", + "\n", + "**Examples:**\n", + "\n", + "Example Table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6e453c8", + "metadata": {}, + "outputs": [], + "source": [ + "tab = kx.Table(data={'x': list(range(3)), 'y': [\"A\", \"B\", \"C\"]})" + ] + }, + { + "cell_type": "markdown", + "id": "aadd23c1", + "metadata": {}, + "source": [ + "Find if element \"A\" or \"1\" is in the table:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d41d40e0", + "metadata": {}, + "outputs": [], + "source": [ + "tab.isin([\"A\", 1])" + ] + }, + { + "cell_type": "markdown", + "id": "cff856fe", + "metadata": {}, + "source": [ + "Find if element \"A\" is in column \"y\":" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bccf59d9", + "metadata": {}, + "outputs": [], + "source": [ + "tab.isin({\"y\": [\"A\"]})" + ] + }, + { + "cell_type": "markdown", + "id": "ed704cce", + "metadata": {}, + "source": [ + "Find if element \"A\" is in the first position of \"y\" column:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41840cc0", + "metadata": {}, + "outputs": [], + "source": [ + "tab.isin(kx.Table(data={\"y\":[\"A\"]}))" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py index 243c6b7..36bdd69 100644 --- a/src/pykx/pandas_api/pandas_meta.py +++ b/src/pykx/pandas_api/pandas_meta.py @@ -290,6 +290,53 @@ def abs(self, numeric_only=False): tab = _get_numeric_only_subtable(self) return q.abs(tab) + @api_return + def isin(self, values): + tab = self + key_table = 'KeyedTable' in str(type(tab)) + key_value = 'KeyedTable' in str(type(values)) + n_rows = 0 + false_dataframe_f = q("""{u:(cols x); + v:(count[u],count[x])#0b; + flip u!v}""") + if key_value and not key_table: + return false_dataframe_f(tab) + if key_table: + kcols = q.key(tab) + if key_value: + n_rows, tab = q("""{n_rows:max 0, count[x]- + count rows:(key y) inter key x; + (n_rows; x each rows)}""", tab, values) + 
values = q.value(values) + else: + tab = q.value(tab) + dic_value, is_tab = q("""{$[98h = type x; + (flip x; 1b); + (x; 0b)]}""", values) + if key_table and not key_value and is_tab: + ftable = false_dataframe_f(tab) + else: + ftable = q("""{ [table; values; is_tab; n_rows] + flip (cols table)! + {[col_name; tab; values; v_is_tab; n_rows] + col: tab col_name; + ltype: .Q.ty col; + values: $[99h~type values; values col_name; values]; + $[v_is_tab or ltype=" "; ; + values@:where (lower ltype) = .Q.t abs type each values]; + $[0 = count values; + (n_rows + count[col])#0b; + $[v_is_tab; + $[any ltype = (" ";"C"); ~'; =] + [mlen#col;mlen#values], + (n_rows + max 0,count[col]- + mlen: min count[values], + count[col])#0b; + any $[any ltype = (" ";"C"); ~/:\:; =\:][values;col] + ]]}[; table; values; is_tab; n_rows] + each cols table}""", tab, dic_value, is_tab, n_rows) + return ftable.set_index(kcols) if key_table else ftable + @convert_result def all(self, axis=0, bool_only=False, skipna=True): res, cols, _ = preparse_computations(self, axis, skipna, bool_only=bool_only) diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py index 80dfb05..8690251 100644 --- a/tests/test_pandas_api.py +++ b/tests/test_pandas_api.py @@ -2339,6 +2339,30 @@ def test_isnull(q): pd.testing.assert_frame_equal(tab.notnull().pd(), expected_inv) +def test_pandas_isin(kx): + tab = kx.q("""([] k1: 0n 1. 0n 2. 0n; + k2: ("A";" ";"B";" ";"A"); + k3: (`a;1.;`c;5;`d))""") + keyed_tab = kx.q("""([`a`b`c`d`e] + k1: 0n 1. 0n 2. 0n; + k2: ("A";" ";"B";" ";"A"); + k3: (`a;1.;`c;5;`d))""") + + list_value = kx.q('(`a;1.;"A")') + dict_value = {"k1": [1., 2., 3.]} + tab_value = kx.q('([] k1: 1. 2. 3.; k2: ("A";"B";"C"))') + keyed_tab_value = kx.q('([`a`b] k1: 1. 
2.; k2: ("A";"B"))') + + assert tab.isin(list_value).pd().equals(tab.pd().isin(list_value.py())) + assert tab.isin(dict_value).pd().equals(tab.pd().isin(dict_value)) + assert tab.isin(tab_value).pd().equals(tab.pd().isin(tab_value.pd())) + assert tab.isin(keyed_tab_value).pd().equals(tab.pd().isin(keyed_tab_value)) + assert keyed_tab.isin(list_value).pd().equals(keyed_tab.pd().isin(list_value.py())) + assert keyed_tab.isin(dict_value).pd().equals(keyed_tab.pd().isin(dict_value)) + assert keyed_tab.isin(keyed_tab_value).pd().equals(keyed_tab.pd().isin(keyed_tab_value.pd())) + assert keyed_tab.isin(tab_value).pd().equals(keyed_tab.pd().isin(tab_value)) + + def test_pandas_count(q): tab = q('([] k1: 0n 2 0n 2 0n ; k2: (`a;`;`b;`;`c))') df = tab.pd()