diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb index 2516ac5..7d6449d 100644 --- a/docs/user-guide/advanced/Pandas_API.ipynb +++ b/docs/user-guide/advanced/Pandas_API.ipynb @@ -757,6 +757,108 @@ "tab.mode(dropna=False)" ] }, + { + "cell_type": "markdown", + "id": "b248fef1", + "metadata": {}, + "source": [ + "### Table.sem()\n", + "\n", + "```\n", + "Table.sem(axis=0, skipna=True, numeric_only=False, ddof=1)\n", + "```\n", + "Return unbiased standard error of the mean over requested axis. Normalized by N-1 by default. This can be changed using the ddof argument.\n", + "\n", + "**Parameters:**\n", + "\n", + "| Name | Type | Description | Default |\n", + "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n", + "| axis | int | The axis to calculate the sem across: 0 is columns, 1 is rows. | 0 |\n", + "| skipna | bool | Not yet implemented. | True |\n", + "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n", + "| ddof | int | Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. | 1 |\n", + "\n", + "**Returns:**\n", + "\n", + "| Type | Description |\n", + "| :----------------: | :------------------------------------------------------------------- |\n", + "| Dictionary | The sem across each row / column with the key corresponding to the row number or column name. 
@api_return
def sem(self, axis: int = 0, ddof: int = 1, numeric_only: bool = False):
    """Return the standard error of the mean along the requested axis.

    Parameters:
        axis: 0 produces one value per column, 1 produces one value per row.
        ddof: Delta degrees of freedom. The divisor used is ``N - ddof``,
            where N is the number of values aggregated along ``axis``.
        numeric_only: When true, the table is first reduced with
            ``_get_numeric_only_subtable``.

    Returns:
        A dictionary keyed by column name (``axis=0``) or by the row number
        converted to a symbol (``axis=1``). When ``ddof`` equals the number
        of values aggregated, every entry is a float null.
    """
    tab = self
    if 'Keyed' in str(type(tab)):
        # Work on the value part of a keyed table only.
        tab = q.value(tab)
    if numeric_only:
        tab = _get_numeric_only_subtable(tab)

    # Fragments of the q query; they differ depending on whether we iterate
    # over column names (axis 0) or over row indices (axis 1).
    key_str = '' if axis == 0 else '`$string '
    val_str = '' if axis == 0 else '"f"$value '
    query_str = 'cols[tab]' if axis == 0 else 'til[count[tab]]'
    where_str = ' where not (::)~/:r[;1]'
    # dev % sqrt(N - ddof) is equivalent to the ddof-adjusted deviation
    # divided by sqrt N.
    sem_str = f'{{dev[x] % sqrt count[x]-{ddof}}}'

    # N is the number of values fed to each aggregation: the row count when
    # reducing down columns, the column count when reducing across rows.
    # It is measured after the keyed/numeric_only adjustments above so it
    # reflects the data that is actually aggregated.
    n_values = len(tab) if axis == 0 else len(q.cols(tab))
    if ddof == n_values:
        # N - ddof is zero, so the query below would divide by sqrt 0.
        # Return nulls instead, using the same key form as the main query
        # (symbols for row numbers when axis is 1).
        return q(
            f'{{[tab]({key_str}{query_str})!count[{query_str}]#0n}}',
            tab
        )

    return q(
        '{[tab]'
        f'r:{{[tab; x] ({key_str}x; {sem_str} {val_str}tab[x])}}[tab;] each {query_str};'
        f'(,/) {{(enlist x 0)!(enlist x 1)}} each r{where_str}}}',
        tab
    )
pytest.approx(q_m[c].py(), precision) + for c in q.key(q_m).py()]) + + p_m = df.sem(axis=1, numeric_only=True) + q_m = tab.sem(axis=1, numeric_only=True) + assert all([p_m[c] == pytest.approx(q_m[q('{`$string x}', c)].py(), precision) + for c in range(len(q.cols(tab)))]) + + with pytest.raises(kx.QError): + q_m = tab.sem() + with pytest.raises(kx.QError): + q_m = tab.sem(axis=1) + + df = pd.DataFrame({'a': [1]}) + tab = kx.toq(df) + p_m = df.sem() + q_m = tab.sem() + assert all([np.isnan(p_m[c]) & np.isnan(q_m[c].py()) + for c in q.key(q_m).py()]) + + p_m = df.sem(ddof=0) + q_m = tab.sem(ddof=0) + assert all([p_m[c] == pytest.approx(q_m[c].py(), precision) + for c in q.key(q_m).py()]) + + def test_mean(kx, q): df = pd.DataFrame( {