Added implementation of sem function (#22)

hablapps · Jan 22, 2024 · 4df1ac7 · 4df1ac7
1 parent f856311
commit 4df1ac7
Show file tree

Hide file tree

Showing 3 changed files with 206 additions and 0 deletions.
diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb
@@ -757,6 +757,108 @@
     "tab.mode(dropna=False)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "b248fef1",
+   "metadata": {},
+   "source": [
+    "### Table.sem()\n",
+    "\n",
+    "```\n",
+    "Table.sem(axis=0, skipna=True, numeric_only=False, ddof=1)\n",
+    "```\n",
+    "Return unbiased standard error of the mean over requested axis. Normalized by N-1 by default. This can be changed using the ddof argument\n",
+    "\n",
+    "**Parameters:**\n",
+    "\n",
+    "| Name         | Type | Description                                                                      | Default |\n",
+    "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n",
+    "| axis         | int  | The axis to calculate the sem across: 0 is columns, 1 is rows.                   | 0       |\n",
+    "| skipna       | bool | not yet implemented                                           | True    |\n",
+    "| numeric_only | bool | Only use columns of the table that are of a numeric data type.                   | False   |\n",
+    "| ddof    | int  | Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. | 1 |\n",
+    "\n",
+    "**Returns:**\n",
+    "\n",
+    "| Type               | Description                                                          |\n",
+    "| :----------------: | :------------------------------------------------------------------- |\n",
+    "| Dictionary         | The sem across each row / column with the key corresponding to the row number or column name. |"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "71bd1d6f",
+   "metadata": {},
+   "source": [
+    "**Examples**\n",
+    "\n",
+    "Calculate the sem across the columns of a table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "350c2b7c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tab = kx.Table(data=\n",
+    "        {\n",
+    "            'a': [1, 2, 2, 4],\n",
+    "            'b': [1, 2, 6, 7],\n",
+    "            'c': [7, 8, 9, 10],\n",
+    "            'd': [7, 11, 14, 14],\n",
+    "        }\n",
+    "    )\n",
+    "tab"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b89307e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tab.sem()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6933f01f",
+   "metadata": {},
+   "source": [
+    "Calculate the sem across the rows of a table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3edd3feb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tab.sem(axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae7afe5a",
+   "metadata": {},
+   "source": [
+    "Calculate sem across columns with ddof=0:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de626961",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tab.sem(ddof=0)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "7e2813b4",

diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py
@@ -234,6 +234,30 @@ def mode(self, axis: int = 0, numeric_only: bool = False, dropna: bool = True):
             tab
         )
 
+    @api_return
+    def sem(self, axis: int = 0, ddof: int = 1, numeric_only: bool = False):
+        """Return the standard error of the mean over the requested axis.
+
+        Parameters:
+            axis: 0 computes one value per column; any other value computes
+                one value per row.
+            ddof: Delta degrees of freedom. The divisor used is `N - ddof`,
+                where `N` is the number of values in each column (axis 0)
+                or in each row (axis 1).
+            numeric_only: Only use columns of the table that are of a
+                numeric data type.
+
+        Returns:
+            A dictionary keyed by column name (axis 0) or by the row number
+            as a symbol (axis 1), holding the standard error of the mean of
+            each column / row.
+        """
+        tab = self
+        if 'Keyed' in str(type(tab)):
+            # Only the value columns of a keyed table take part.
+            tab = q.value(tab)
+        if numeric_only:
+            tab = _get_numeric_only_subtable(tab)
+
+        key_str = '' if axis == 0 else '`$string '
+        val_str = '' if axis == 0 else '"f"$value '
+        query_str = 'cols[tab]' if axis == 0 else 'til[count[tab]]'
+        where_str = ' where not (::)~/:r[;1]'
+        # q evaluates right to left, so this is dev[x] % sqrt[count[x] - ddof].
+        # With `dev` being the population deviation sqrt(SS / N), the result is
+        # sqrt(SS / (N * (N - ddof))), i.e. the ddof-adjusted deviation
+        # divided by sqrt(N).
+        sem_str = f'{{dev[x] % sqrt count[x]-{ddof}}}'
+
+        # Number of values in each sample: rows per column for axis 0,
+        # columns per row otherwise.
+        sample_size = len(tab) if axis == 0 else len(q.cols(tab))
+        if ddof == sample_size:
+            # The divisor would be zero; return nulls under the same keys the
+            # regular path produces instead of dividing by zero.
+            return q(f'{{[tab]({key_str}{query_str})!count[{query_str}]#0n}}', tab)
+
+        return q(
+            '{[tab]'
+            f'r:{{[tab; x] ({key_str}x; {sem_str} {val_str}tab[x])}}[tab;] each {query_str};'
+            f'(,/) {{(enlist x 0)!(enlist x 1)}} each r{where_str}}}',
+            tab
+        )
+
+
     @api_return
     def abs(self, numeric_only=False):
         tab = self

diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py
@@ -1498,6 +1498,86 @@ def test_df_sample(kx, q):
         t.sample(ignore_index=True)
 
 
+def test_sem(kx, q):
+    """Check Table.sem against pandas.DataFrame.sem for both axes."""
+    df = pd.DataFrame(
+        {
+            'a': [1, 2, 2, 4],
+            'b': [1, 2, 6, 7],
+            'c': [7, 8, 9, 10],
+            'd': [7, 11, 14, 14]
+        }
+    )
+    precision = 1e-16
+    tab = kx.toq(df)
+
+    # Unkeyed table, default arguments: one result per column.
+    p_m = df.sem()
+    q_m = tab.sem()
+    assert all([p_m[c] == pytest.approx(q_m[c].py(), precision)
+                for c in q.key(q_m).py()])
+
+    # axis=1 yields one result per row, so iterate over the row positions
+    # rather than the number of columns.
+    p_m = df.sem(axis=1)
+    q_m = tab.sem(axis=1)
+    assert all([p_m[c] == pytest.approx(q_m[q('{`$string x}', c)].py(), precision)
+                for c in range(len(df))])
+
+    p_m = df.sem(ddof=0)
+    q_m = tab.sem(ddof=0)
+    assert all([p_m[c] == pytest.approx(q_m[c].py(), precision)
+                for c in q.key(q_m).py()])
+
+    # ddof equal to the number of rows: every column result is NaN / null.
+    p_m = df.sem(ddof=4)
+    q_m = tab.sem(ddof=4)
+    assert all([np.isnan(p_m[c]) & np.isnan(q_m[c].py())
+                for c in q.key(q_m).py()])
+
+    # Keyed table: the key column must not take part in the calculation.
+    q['tab'] = kx.toq(df)
+    tab = q('1!`idx xcols update idx: til count tab from tab')
+    p_m = df.sem()
+    q_m = tab.sem()
+    assert all([p_m[c] == pytest.approx(q_m[c].py(), precision)
+                for c in q.key(q_m).py()])
+
+    p_m = df.sem(axis=1)
+    q_m = tab.sem(axis=1)
+    assert all([p_m[c] == pytest.approx(q_m[q('{`$string x}', c)].py(), precision)
+                for c in range(len(df))])
+
+    # Mixed-type table: numeric_only drops the string column.
+    df = pd.DataFrame(
+        {
+            'a': [1, 2, 2, 4],
+            'b': [1, 2, 6, 7],
+            'c': [7, 8, 9, 10],
+            'd': ['foo', 'bar', 'baz', 'qux']
+        }
+    )
+    tab = kx.toq(df)
+    p_m = df.sem(numeric_only=True)
+    q_m = tab.sem(numeric_only=True)
+    assert all([p_m[c] == pytest.approx(q_m[c].py(), precision)
+                for c in q.key(q_m).py()])
+
+    p_m = df.sem(axis=1, numeric_only=True)
+    q_m = tab.sem(axis=1, numeric_only=True)
+    assert all([p_m[c] == pytest.approx(q_m[q('{`$string x}', c)].py(), precision)
+                for c in range(len(df))])
+
+    # Without numeric_only the string column makes the q query fail.
+    with pytest.raises(kx.QError):
+        q_m = tab.sem()
+    with pytest.raises(kx.QError):
+        q_m = tab.sem(axis=1)
+
+    # Single-row table: NaN with the default ddof, a number with ddof=0.
+    df = pd.DataFrame({'a': [1]})
+    tab = kx.toq(df)
+    p_m = df.sem()
+    q_m = tab.sem()
+    assert all([np.isnan(p_m[c]) & np.isnan(q_m[c].py())
+                for c in q.key(q_m).py()])
+
+    p_m = df.sem(ddof=0)
+    q_m = tab.sem(ddof=0)
+    assert all([p_m[c] == pytest.approx(q_m[c].py(), precision)
+                for c in q.key(q_m).py()])
+
+
 def test_mean(kx, q):
     df = pd.DataFrame(
         {