diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb
index abd5727..2516ac5 100644
--- a/docs/user-guide/advanced/Pandas_API.ipynb
+++ b/docs/user-guide/advanced/Pandas_API.ipynb
@@ -462,6 +462,91 @@
     "tab.mean(axis=1)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "fe565b65-fbf2-47ba-a26e-791d09fd4f55",
+   "metadata": {},
+   "source": [
+    "### Table.kurt()\n",
+    "\n",
+    "```\n",
+    "Table.kurt(axis=0, skipna=True, numeric_only=False)\n",
+    "```\n",
+    "\n",
+    "Return unbiased kurtosis over requested axis. Kurtosis obtained using Fisher’s definition of kurtosis (kurtosis of normal == 0.0). Normalized by N-1.\n",
+    "\n",
+    "\n",
+    "**Parameters:**\n",
+    "\n",
+    "| Name | Type | Description | Default |\n",
+    "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n",
+    "| axis | int | Axis for the function to be applied on. 0 is columns, 1 is rows. | 0 |\n",
+    "| skipna | bool | not yet implemented | True |\n",
+    "| numeric_only | bool | Only use columns of the table that are of a numeric data type. | False |\n",
+    "\n",
+    "**Returns:**\n",
+    "\n",
+    "| Type | Description |\n",
+    "| :--------: | :--------------------------------------------------------------------------------------- |\n",
+    "| Dictionary | Map of columns and their yielded kurtosis values |"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e6069cac-d260-4f80-9688-3d1ec273cd22",
+   "metadata": {},
+   "source": [
+    "**Examples:**\n",
+    "\n",
+    "Calculate the kurtosis across the columns of a table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4219c826-a84b-4722-9847-372d3837acdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tab = kx.Table(data=\n",
+    "    {\n",
+    "        'a': [1, 2, 2, 4],\n",
+    "        'b': [1, 2, 6, 7],\n",
+    "        'c': [7, 8, 9, 10],\n",
+    "        'd': [7, 11, 14, 14]\n",
+    "    }\n",
+    ")\n",
+    "tab"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "437ab485-bf73-4209-b63e-aa0d1bfa5d58",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tab.kurt()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ea3e1cf6-2304-4061-a846-1cbc0572ea9d",
+   "metadata": {},
+   "source": [
+    "Calculate the kurtosis across the rows of a table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63312e8b-76f0-46eb-b4d7-b2213561c86e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tab.kurt(axis=1)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "7bf853c5",
@@ -3099,7 +3184,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.3"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,
diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py
index 2c6a089..24b4b30 100644
--- a/src/pykx/pandas_api/pandas_meta.py
+++ b/src/pykx/pandas_api/pandas_meta.py
@@ -158,6 +158,51 @@ def mean(self, axis: int = 0, numeric_only: bool = False):
             tab
         )
 
+    @api_return
+    def kurt(self, axis: int = 0, numeric_only: bool = False):
+        """Return the unbiased excess kurtosis over the requested axis.
+
+        Fisher's definition is used, so a normal distribution gives 0.0.
+
+        Parameters:
+            axis: 0 yields one value per column; any other value yields one
+                value per row.
+            numeric_only: If True, only the numeric columns are used.
+
+        Returns:
+            A dictionary keyed by column name (axis 0) or by the row index
+            cast to a symbol (otherwise), as passed through `api_return`.
+        """
+        tab = self
+        if 'Keyed' in str(type(tab)):
+            # Work on the value table only; key columns are not aggregated
+            tab = q.value(tab)
+        if numeric_only:
+            tab = _get_numeric_only_subtable(tab)
+
+        # Row-wise: index keys become symbols and row values are cast to float
+        key_str = '' if axis == 0 else '`$string '
+        val_str = '' if axis == 0 else '"f"$value '
+        query_str = 'cols tab' if axis == 0 else 'til count tab'
+        # Drop entries whose result is the generic null (::)
+        where_str = ' where not (::)~/:r[;1]'
+        # G2 = n(n+1)(n-1)*m4 / ((n-2)(n-3)*m2^2) - 3(n-1)^2 / ((n-2)(n-3)),
+        # where m2 and m4 are the sums of squared and fourth-power deviations
+        kurt_str = ('{res: x - avg x;'
+                    'n: count x;'
+                    'm2: sum res_sq: res xexp 2;'
+                    'm4: sum res_sq xexp 2;'
+                    'adj: 3 * xexp[n - 1;2] % (n - 2) * (n - 3);'
+                    'num: n * (n + 1) * (n - 1) * m4;'
+                    'den: (n - 2) * (n - 3) * m2 xexp 2;'
+                    '(num % den) - adj}')
+        return q(
+            '{[tab]'
+            f'r:{{[tab; x] ({key_str}x; {kurt_str} {val_str}tab[x])}}[tab;] each {query_str};'
+            f'(,/) {{(enlist x 0)!(enlist x 1)}} each r{where_str}}}',
+            tab
+        )
+
     @api_return
     def median(self, axis: int = 0, numeric_only: bool = False):
         tab = self
diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py
index 690708e..ec76223 100644
--- a/tests/test_pandas_api.py
+++ b/tests/test_pandas_api.py
@@ -1552,6 +1552,84 @@ def test_mean(kx, q):
         q_m = tab.mean(axis=1)
 
 
+def test_kurt(kx, q):
+    """Compare Table.kurt with pandas DataFrame.kurt along both axes."""
+    df = pd.DataFrame(
+        {
+            'a': [1, 2, 2, 4],
+            'b': [1, 2, 6, 7],
+            'c': [7, 8, 9, 10],
+            'd': [7, 11, 14, 14]
+        }
+    )
+    tab = kx.toq(df)
+    p_m = df.kurt()
+    q_m = tab.kurt()
+    for c in q.key(q_m).py():
+        assert p_m[c] == q_m[c].py()
+    p_m = df.kurt(axis=1)
+    q_m = tab.kurt(axis=1)
+    # Row-wise results have one entry per row, keyed by the row index as a symbol
+    for c in range(len(df)):
+        assert p_m[c] == q_m[q('{`$string x}', c)].py()
+
+    # Keyed table: only the value columns take part in the calculation
+    q['tab'] = kx.toq(df)
+    tab = q('1!`idx xcols update idx: til count tab from tab')
+    p_m = df.kurt()
+    q_m = tab.kurt()
+    for c in q.key(q_m).py():
+        assert p_m[c] == q_m[c].py()
+    p_m = df.kurt(axis=1)
+    q_m = tab.kurt(axis=1)
+    for c in range(len(df)):
+        assert p_m[c] == q_m[q('{`$string x}', c)].py()
+
+    # String column present: numeric_only=True leaves three numeric columns
+    df = pd.DataFrame(
+        {
+            'a': [1, 2, 2, 4],
+            'b': [1, 2, 6, 7],
+            'c': [7, 8, 9, 10],
+            'd': ['foo', 'bar', 'baz', 'qux']
+        }
+    )
+    tab = kx.toq(df)
+    p_m = df.kurt(numeric_only=True)
+    q_m = tab.kurt(numeric_only=True)
+    for c in q.key(q_m).py():
+        assert p_m[c] == q_m[c].py()
+    p_m = df.kurt(axis=1, numeric_only=True)
+    q_m = tab.kurt(axis=1, numeric_only=True)
+    for c in range(len(df)):
+        assert np.isnan(p_m[c]) and np.isnan(q_m[q('{`$string x}', c)].py())
+
+    df = pd.DataFrame(
+        {
+            'a': [1, 2, 2, 4],
+            'b': [1, 2, 6, 7],
+            'c': [7, 8, 9, 10],
+            'd': [11, 12, 13, 14],
+            'e': ['foo', 'bar', 'baz', 'qux']
+        }
+    )
+    tab = kx.toq(df)
+    p_m = df.kurt(numeric_only=True)
+    q_m = tab.kurt(numeric_only=True)
+    for c in q.key(q_m).py():
+        assert p_m[c] == q_m[c].py()
+    p_m = df.kurt(axis=1, numeric_only=True)
+    q_m = tab.kurt(axis=1, numeric_only=True)
+    for c in range(len(df)):
+        assert p_m[c] == q_m[q('{`$string x}', c)].py()
+
+    # Without numeric_only the string column is expected to raise a QError
+    with pytest.raises(kx.QError):
+        q_m = tab.kurt()
+    with pytest.raises(kx.QError):
+        q_m = tab.kurt(axis=1)
+
+
 def test_median(kx, q):
     df = pd.DataFrame(
         {