Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pandas API isin function #36

Open
wants to merge 2 commits into
base: feature/pandas-api-3rd-block
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions docs/user-guide/advanced/Pandas_API.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2659,6 +2659,105 @@
"Example Table."
]
},
{
"cell_type": "markdown",
"id": "7f08eb84",
"metadata": {},
"source": [
"## Comparison\n",
"\n",
"### Table.isin()\n",
"\n",
"```\n",
"Table.isin(\n",
" values\n",
")\n",
"```\n",
"\n",
"Whether each element in the Table is contained in values.\n",
"\n",
"**Parameters:**\n",
"\n",
"| Name | Type | Description | Default |\n",
"| :--------------: | :---------------------------------: | :-------------------------------------------------------------------------- | :------: |\n",
"| values | Union[List, dict, Table, KeyedTable] | The result will only be true at a location if all the labels match. If values is a dict, the keys must be the column names, which must match. If values is a Table or KeyedTable, then both the index and column labels must match. | _required_ |\n",
"\n",
"\n",
"**Returns:**\n",
"\n",
"| Type | Description |\n",
"| :-----------------------: | :---------------------------------------------- |\n",
"| Table | Boolean type Table/KeyedTable showing whether each element in the Table is contained in values. |\n",
"\n",
"**Examples:**\n",
"\n",
"Example Table."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6e453c8",
"metadata": {},
"outputs": [],
"source": [
"tab = kx.Table(data={'x': list(range(3)), 'y': [\"A\", \"B\", \"C\"]})"
]
},
{
"cell_type": "markdown",
"id": "aadd23c1",
"metadata": {},
"source": [
"Find if element \"A\" or 1 is in the table:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d41d40e0",
"metadata": {},
"outputs": [],
"source": [
"tab.isin([\"A\", 1])"
]
},
{
"cell_type": "markdown",
"id": "cff856fe",
"metadata": {},
"source": [
"Find if element \"A\" is in column \"y\":"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bccf59d9",
"metadata": {},
"outputs": [],
"source": [
"tab.isin({\"y\": [\"A\"]})"
]
},
{
"cell_type": "markdown",
"id": "ed704cce",
"metadata": {},
"source": [
"Find if element \"A\" is in the first position of the \"y\" column:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41840cc0",
"metadata": {},
"outputs": [],
"source": [
"tab.isin(kx.Table(data={\"y\":[\"A\"]}))"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
70 changes: 70 additions & 0 deletions src/pykx/pandas_api/pandas_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,76 @@ def abs(self, numeric_only=False):
tab = _get_numeric_only_subtable(self)
return q.abs(tab)

@api_return
def isin(self, values):
    """Return a boolean table flagging which elements of this table appear in `values`.

    How the comparison is made depends on the type of `values`:

    - list (Python or PyKX): the whole table is tested against the list with q `in`.
    - Table: cells are compared where both the row position and the column name
      match. If this table is keyed, the result is all false.
    - KeyedTable: cells are compared by key and column name. If this table is
      unkeyed, or has a different number of key columns, the result is all false.
    - dict / q dictionary: each column whose name is a key of `values` is tested
      against that key's entries; every other column is all false.

    Parameters:
        values: A list, dict, Table or KeyedTable holding the values to look for.

    Returns:
        A boolean table. The all-false, keyed-table and dict results keep this
        table's key columns when it is keyed.

    Raises:
        ValueError: If `values` is none of the supported types.
    """
    tab = self

    # Types are identified by name so that no concrete PyKX class has to be
    # referenced here.
    keyed_in = 'KeyedTable' in str(type(tab))
    keyed_val = 'KeyedTable' in str(type(values))

    # q lambda: builds a table with the same columns (and the same keys, when
    # the argument is keyed) as its argument, with every cell set to 0b.
    # It is only *applied* in the branches that need the result, so list and
    # dict lookups no longer allocate a full-size table they never use.
    make_false_table = q("""{u:$[99h~type x;cols value x;cols x];
v:(count[u],count[x])#0b;
t:flip u!v;
$[99h~type x;key[x]!t;t]}""")

    # q lambda taking the input table (it), the values table (vt) and the
    # all-false table (ft), each keyed on one index column (`1_cols` drops
    # that first column name). For every (index, column) pair present in both
    # it and vt, the two cells are compared; the outcome is combined with ft
    # using `or`.
    isin_tab = q('''{[it;vt;ft]
idxt:raze value flip key it;
colt:1_cols it;
idxv:raze value flip key vt;
colv:1_cols vt;
p:(idxt inter idxv) cross colt inter colv;
cv:{[k1;k2;ti;tv]
enlist[ti[k1][k2]] in enlist[tv[k1][k2]]}[;;it;vt];
vals:flip `x`field`values!flip[p],flip cv .' p;
aux:exec ((`$string field)!values) by x:x from vals;
aux or ft}''')

    # q lambda: n -> one-column table `x` holding 0..n-1, used as a positional index.
    gen_idx = q('{flip enlist[`x]!enlist til x}')

    # list (PyKX and Python): matches any type whose name contains "list"
    if "list" in str(type(values)).lower():
        return q('{x in y}', tab, values)
    # table
    elif q('{98h~type x}', values):
        false_table = make_false_table(tab)
        # values is unkeyed here, so a keyed input can never line up with it.
        if keyed_in != keyed_val:
            return false_table

        # Index both tables by row position so that isin_tab compares row by row.
        idx_tab = gen_idx(len(tab))
        idx_values = gen_idx(len(values))
        return q.value(isin_tab(tab.set_index(idx_tab),
                                values.set_index(idx_values),
                                false_table.set_index(idx_tab)))
    # keyed table
    elif keyed_val:
        false_table = make_false_table(tab)
        if (keyed_in != keyed_val
                or len(q.key(tab).columns) != len(q.key(values).columns)):
            return false_table

        # Remember the real keys; they are restored on the result below.
        old_idx_tab = q.key(tab)
        idx_tab = gen_idx(len(tab))

        # Index for `values` built from the positions, within tab's keys, of
        # the keys the two tables share.
        # NOTE(review): presumably this aligns rows of `values` with the rows
        # of `tab` holding the same key - confirm for partially overlapping keys.
        idx_values = q('''{kt:flip value flip key x;
kv:flip value flip key y;
flip enlist[`x]!enlist count[kv]#kt?inter[kv;kt]}''', tab, values)

        res = q.value(isin_tab(q.value(tab).set_index(idx_tab),
                               q.value(values).set_index(idx_values),
                               q.value(false_table).set_index(idx_tab)))

        return res.set_index(old_idx_tab)
    # dict: checked after the keyed-table branch, which also handles type 99h values
    elif isinstance(values, dict) or q('{99h~type x}', values):
        return q('''{[t;d]
tv:$[kt:99h~type t;value t;t];
cd:{[k;t;d]
$[k in key d;
t[k] in d[k];
count[t]#0b]}[;tv;d];
r:flip cols[tv]!cd each cols tv;
$[kt;key[t]!r;r]}
''', tab, values)
    else:
        raise ValueError("Not a valid argument type.")

@convert_result
def all(self, axis=0, bool_only=False, skipna=True):
res, cols = preparse_computations(self, axis, skipna, bool_only=bool_only)
Expand Down
54 changes: 54 additions & 0 deletions tests/test_pandas_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2109,6 +2109,60 @@ def test_keyed_loc_fixes(q):
mkt['k1']


def test_pandas_isin(kx):
    """Check `isin` on unkeyed, keyed and multi-keyed tables against pandas' `DataFrame.isin`."""
    plain = kx.q("""([] k1: 0n 1. 0n 2. 0n;
k2: ("A";" ";"B";" ";"A");
k3: (`a;1.;`c;5;`d))""")
    keyed = kx.q("""([`a`b`c`d`e]
k1: 0n 1. 0n 2. 0n;
k2: ("A";" ";"B";" ";"A");
k3: (`a;1.;`c;5;`d))""")

    multi_index = kx.q('([]x:0 1 0 1;y:0 0 1 1)')
    multi_keyed = kx.q('''([]a:`foo`bar`baz`qux;
b:"f"$til 4;
c:reverse "f"$til 4)''').set_index(multi_index)

    list_arg = kx.q('(`a;1.;"A")')
    table_arg = kx.q('([] k1: 1. 2. 3.; k2: ("A";"B";"C"))')
    dict_arg = {"k1": [1., 2., 3.]}
    keyed_arg = kx.q('([`a`b] k1: 1. 2.; k2: ("A";"B"))')
    multi_arg_index = kx.q('([]x:1 1 0;y:0 1 0)')
    multi_arg = kx.q('''([]a:`bar`foo`foo;
b:0 1 0)''').set_index(multi_arg_index)

    # Each argument is paired with the conversion that yields its pandas-side
    # equivalent; the conversion is applied afresh for every comparison.
    def unchanged(value):
        return value

    def via_py(value):
        return value.py()

    def via_pd(value):
        return value.pd()

    from_list = (list_arg, via_py)
    from_dict = (dict_arg, unchanged)
    from_table = (table_arg, via_pd)
    from_keyed = (keyed_arg, via_pd)
    from_multi = (multi_arg, via_pd)

    scenarios = (
        (plain, (from_list, from_dict, from_table, from_keyed)),
        (keyed, (from_list, from_dict, from_table, from_keyed, from_multi)),
        (multi_keyed, (from_list, from_dict, from_table, from_multi, from_keyed)),
    )

    for subject, arguments in scenarios:
        for argument, to_pandas in arguments:
            actual = subject.isin(argument).pd()
            expected = subject.pd().isin(to_pandas(argument))
            pd.testing.assert_frame_equal(actual, expected)


def test_pandas_count(q):
tab = q('([] k1: 0n 2 0n 2 0n ; k2: (`a;`;`b;`;`c))')
df = tab.pd()
Expand Down
Loading