Skip to content

Commit

Permalink
Added implementation of sem function (#22)
Browse files Browse the repository at this point in the history
  • Loading branch information
tortolavivo23 authored and Miguel Gómez committed Jan 22, 2024
1 parent f856311 commit 4df1ac7
Show file tree
Hide file tree
Showing 3 changed files with 206 additions and 0 deletions.
102 changes: 102 additions & 0 deletions docs/user-guide/advanced/Pandas_API.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -757,6 +757,108 @@
"tab.mode(dropna=False)"
]
},
{
"cell_type": "markdown",
"id": "b248fef1",
"metadata": {},
"source": [
"### Table.sem()\n",
"\n",
"```\n",
"Table.sem(axis=0, skipna=True, numeric_only=False, ddof=1)\n",
"```\n",
"Return the unbiased standard error of the mean over the requested axis. Normalized by N-1 by default; this can be changed using the ddof argument.\n",
"\n",
"**Parameters:**\n",
"\n",
"| Name | Type | Description | Default |\n",
"| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n",
"| axis | int | The axis to calculate the sem across: 0 is columns, 1 is rows. | 0 |\n",
"| skipna | bool | not yet implemented | True |\n",
"| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n",
"| ddof | int | Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. | 1 |\n",
"\n",
"**Returns:**\n",
"\n",
"| Type | Description |\n",
"| :----------------: | :------------------------------------------------------------------- |\n",
"| Dictionary | The sem across each row / column with the key corresponding to the row number or column name. |"
]
},
{
"cell_type": "markdown",
"id": "71bd1d6f",
"metadata": {},
"source": [
"**Examples**\n",
"\n",
"Calculate the sem across the columns of a table"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "350c2b7c",
"metadata": {},
"outputs": [],
"source": [
"tab = kx.Table(data=\n",
" {\n",
" 'a': [1, 2, 2, 4],\n",
" 'b': [1, 2, 6, 7],\n",
" 'c': [7, 8, 9, 10],\n",
" 'd': [7, 11, 14, 14],\n",
" }\n",
" )\n",
"tab"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b89307e9",
"metadata": {},
"outputs": [],
"source": [
"tab.sem()"
]
},
{
"cell_type": "markdown",
"id": "6933f01f",
"metadata": {},
"source": [
"Calculate the sem across the rows of a table"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3edd3feb",
"metadata": {},
"outputs": [],
"source": [
"tab.sem(axis=1)"
]
},
{
"cell_type": "markdown",
"id": "ae7afe5a",
"metadata": {},
"source": [
"Calculate sem across columns with ddof=0:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "de626961",
"metadata": {},
"outputs": [],
"source": [
"tab.sem(ddof=0)"
]
},
{
"cell_type": "markdown",
"id": "7e2813b4",
Expand Down
24 changes: 24 additions & 0 deletions src/pykx/pandas_api/pandas_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,30 @@ def mode(self, axis: int = 0, numeric_only: bool = False, dropna: bool = True):
tab
)

@api_return
def sem(self, axis: int = 0, ddof: int = 1, numeric_only: bool = False):
    """Return the standard error of the mean over the requested axis.

    Parameters:
        axis: ``0`` produces one value per column; any other value produces
            one value per row.
        ddof: Delta degrees of freedom. The divisor used is ``N - ddof``,
            where ``N`` is the number of elements in each aggregated
            column (``axis=0``) or row (``axis=1``).
        numeric_only: When true, the table is first reduced with
            ``_get_numeric_only_subtable``.

    Returns:
        The dictionary built by the q query (handed back through the
        ``api_return`` decorator). Keys are the column names for
        ``axis=0`` and the row numbers cast to symbols otherwise. Every
        value is a float null when ``ddof`` equals ``N``.
    """
    tab = self
    # Keyed tables are aggregated over their value columns only.
    if 'Keyed' in str(type(tab)):
        tab = q.value(tab)
    if numeric_only:
        tab = _get_numeric_only_subtable(tab)

    by_column = axis == 0
    # Row-wise results are keyed by the row number rendered as a symbol,
    # and each row (a dictionary) is flattened to a float vector first.
    key_str = '' if by_column else '`$string '
    val_str = '' if by_column else '"f"$value '
    query_str = 'cols[tab]' if by_column else 'til[count[tab]]'
    # Drop entries whose computed value is the generic null (::).
    where_str = ' where not (::)~/:r[;1]'
    # q evaluates right to left: dev[x] % sqrt (count[x] - ddof). Dividing
    # the population deviation by sqrt(N - ddof) equals the ddof-adjusted
    # standard deviation divided by sqrt(N).
    sem_str = f'{{dev[x] % sqrt count[x]-{ddof}}}'

    # N is the number of elements each aggregate spans: the rows of a
    # column for axis=0, but the columns of a row otherwise.
    n_elements = len(tab) if by_column else len(q.cols(tab))
    no_rows_to_aggregate = not by_column and len(tab) == 0
    if ddof == n_elements or no_rows_to_aggregate:
        # N - ddof is zero, so the divisor would be sqrt 0; return float
        # nulls instead, keyed exactly as the regular path keys its result.
        return q(
            f'{{[tab]({key_str}{query_str})!count[{query_str}]#0n}}',
            tab
        )

    return q(
        '{[tab]'
        f'r:{{[tab; x] ({key_str}x; {sem_str} {val_str}tab[x])}}[tab;] each {query_str};'
        f'(,/) {{(enlist x 0)!(enlist x 1)}} each r{where_str}}}',
        tab
    )

@api_return
def abs(self, numeric_only=False):
tab = self
Expand Down
80 changes: 80 additions & 0 deletions tests/test_pandas_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1498,6 +1498,86 @@ def test_df_sample(kx, q):
t.sample(ignore_index=True)


def test_sem(kx, q):
    """Compare ``Table.sem`` with ``pandas.DataFrame.sem``.

    Covers column-wise and row-wise results, explicit ``ddof`` values
    (including one for which pandas yields NaN), a keyed table, the
    ``numeric_only`` flag, the ``QError`` raised when a non-numeric column
    is aggregated, and a single-row table.
    """
    precision = 1e-16

    def assert_columns_match(p_m, q_m):
        # Column-wise results are keyed by column name on both sides.
        assert all(p_m[c] == pytest.approx(q_m[c].py(), precision)
                   for c in q.key(q_m).py())

    def assert_rows_match(p_m, q_m, n_rows):
        # Row-wise q results are keyed by the row number as a symbol, so
        # iterate over the number of rows, not the number of columns.
        assert all(p_m[r] == pytest.approx(q_m[q('{`$string x}', r)].py(), precision)
                   for r in range(n_rows))

    def assert_all_nan(p_m, q_m):
        assert all(np.isnan(p_m[c]) and np.isnan(q_m[c].py())
                   for c in q.key(q_m).py())

    df = pd.DataFrame(
        {
            'a': [1, 2, 2, 4],
            'b': [1, 2, 6, 7],
            'c': [7, 8, 9, 10],
            'd': [7, 11, 14, 14]
        }
    )
    tab = kx.toq(df)
    assert_columns_match(df.sem(), tab.sem())
    assert_rows_match(df.sem(axis=1), tab.sem(axis=1), len(df))
    assert_columns_match(df.sem(ddof=0), tab.sem(ddof=0))
    # ddof equal to the number of rows leaves no degrees of freedom.
    assert_all_nan(df.sem(ddof=4), tab.sem(ddof=4))

    # Keyed table: the key column must not take part in the aggregation.
    q['tab'] = kx.toq(df)
    tab = q('1!`idx xcols update idx: til count tab from tab')
    assert_columns_match(df.sem(), tab.sem())
    assert_rows_match(df.sem(axis=1), tab.sem(axis=1), len(df))

    df = pd.DataFrame(
        {
            'a': [1, 2, 2, 4],
            'b': [1, 2, 6, 7],
            'c': [7, 8, 9, 10],
            'd': ['foo', 'bar', 'baz', 'qux']
        }
    )
    tab = kx.toq(df)
    assert_columns_match(df.sem(numeric_only=True), tab.sem(numeric_only=True))
    assert_rows_match(
        df.sem(axis=1, numeric_only=True),
        tab.sem(axis=1, numeric_only=True),
        len(df)
    )

    # Without numeric_only the string column cannot be aggregated.
    with pytest.raises(kx.QError):
        q_m = tab.sem()
    with pytest.raises(kx.QError):
        q_m = tab.sem(axis=1)

    # Single row: the default ddof=1 leaves no degrees of freedom.
    df = pd.DataFrame({'a': [1]})
    tab = kx.toq(df)
    assert_all_nan(df.sem(), tab.sem())
    assert_columns_match(df.sem(ddof=0), tab.sem(ddof=0))


def test_mean(kx, q):
df = pd.DataFrame(
{
Expand Down

0 comments on commit 4df1ac7

Please sign in to comment.