Feat/datacompy (#341)

# Release 0.8.2 This release makes it easier to build apps on top of buckaroo. Post-processing functions can now hide columns. CustomizableDataflow (which all widgets extend) gets a new `init_sd` parameter, an initial summary_dict, which makes it easier to hard-code summary_dict values. Styling columns is now more resilient: previously, if a call to `style_column` failed, either an error was thrown or the column was hidden; now a default `obj` displayer is used instead. A [Datacompy_app](capitalone/datacompy#372) example was built using this new functionality; the app compares dataframes with the [datacompy](https://github.com/capitalone/datacompy) library.
paddymul · Jan 16, 2025 · 99de0a1 · 99de0a1
1 parent 2618922
commit 99de0a1
Show file tree

Hide file tree

Showing 12 changed files with 634 additions and 50 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,3 +7,13 @@ It also moves the js code to `packages/buckaroo_js_core` This is a regular react
 
 None of the end user experience should change with this release.
 
+## 0.8.2 2025-01-15
+
+This release makes it easier to build apps on top of buckaroo.
+
+Post processing functions can now hide columns.
+CustomizableDataflow (which all widgets extend) gets a new `init_sd` parameter, an initial summary_dict.  This makes it easier to hard-code summary_dict values.
+
+More resiliency around styling columns.  Previously, if a call to `style_column` failed, either an error was thrown or the column was hidden; now a default `obj` displayer is used instead.
+
+A [Datacompy_app](https://github.com/capitalone/datacompy/issues/372) example was built using this new functionality.  This app compares dataframes with the [datacompy](https://github.com/capitalone/datacompy) library.
diff --git a/README.md b/README.md
@@ -148,6 +148,19 @@ uv venv
 uv sync -q
 
 ```
+### Release instructions
+[github release instructions](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository)
+
+```bash
+git tag $VERSION_NUMBER #no leading v
+# update CHANGELOG.md
+#push code and tag to github
+```
+Then navigate to [create new buckaroo release](https://github.com/paddymul/buckaroo/releases/new)
+and follow the instructions there.
+
+
+
 
 ## Contributions
 

diff --git a/buckaroo/dataflow/dataflow.py b/buckaroo/dataflow/dataflow.py
@@ -168,7 +168,7 @@ def _summary_sd(self, change):
         result_summary_sd = self._get_summary_sd(self.processed_df)
         self.summary_sd = result_summary_sd
 
-    @observe('summary_sd')
+    @observe('summary_sd', 'processed_result')
     @exception_protect('merged_sd-protector')
     def _merged_sd(self, change):
         #slightly inconsitent that processed_sd gets priority over
@@ -207,7 +207,12 @@ class CustomizableDataflow(DataFlow):
     def __init__(self, orig_df, debug=False,
                  column_config_overrides=None,
                  pinned_rows=None, extra_grid_config=None,
-                 component_config=None):
+                 component_config=None, init_sd=None):
+        if init_sd is None:
+            self.init_sd = {}
+        else:
+            self.init_sd = init_sd
+
         if column_config_overrides is None:
             column_config_overrides = {}
         self.column_config_overrides = column_config_overrides
@@ -273,6 +278,16 @@ def setup_options_from_analysis(self):
     df_data_dict = Any({'empty':[]}).tag(sync=True)
 
 
+    @observe('summary_sd', 'processed_result')
+    @exception_protect('merged_sd-protector')
+    def _merged_sd(self, change):
+        #slightly inconsistent that processed_sd gets priority over
+        #summary_sd, given that processed_df is computed first. My
+        #thinking was that processed_sd has greater total knowledge
+        #and should supersede summary_sd.
+        self.merged_sd = merge_sds(self.init_sd, self.cleaned_sd, self.summary_sd, self.processed_sd)
+
+
     ### start code interpreter block
     def add_command(self, incomingCommandKls):
         return self.ac_obj.add_command(incomingCommandKls)
@@ -374,6 +389,8 @@ def _handle_widget_change(self, change):
         self.df_display_args = temp_display_args
 
 """
+
+
 Instantiation
 df_data_dict starts with only 'empty'
 first populate df_display_args, make all data point to 'empty', make all df_viewer_configs EMPTY_DFVIEWER_CONFIG

diff --git a/buckaroo/dataflow/dataflow_extras.py b/buckaroo/dataflow/dataflow_extras.py
@@ -1,7 +1,12 @@
 import sys
+import logging
+
+
 import pandas as pd
 from buckaroo.pluggable_analysis_framework.pluggable_analysis_framework import (ColAnalysis)
 
+logger = logging.getLogger()
+
 EMPTY_DFVIEWER_CONFIG = {
     'pinned_rows': [],
     'column_config': []}
@@ -151,20 +156,32 @@ def style_column(kls, col, column_metadata):
     data_key = "main"
     summary_stats_key= 'all_stats'
 
+    @classmethod
+    def default_styling(kls, col_name):
+        return {'col_name': col_name, 'displayer_args': {'displayer': 'obj'}}
+
     @classmethod
     def style_columns(kls, sd):
         ret_col_config = []
-
         #this is necessary for polars to add an index column, which is
         #required so that summary_stats makes sense
         if 'index' not in sd:
-            ret_col_config.append({'col_name': 'index', 'displayer_args': {'displayer': 'obj'}})
+            ret_col_config.append(kls.default_styling('index'))
 
         for col in sd.keys():
             col_meta = sd[col]
-            base_style = kls.style_column(col, col_meta)
+            if col_meta.get('merge_rule') == 'hidden':
+                continue
+            try:
+                base_style = kls.style_column(col, col_meta)
+            except Exception:
+                logger.warn(f"Warning, styling failed from {kls} on column {col} with col_meta {col_meta} using default_styling instead")
+                base_style = kls.default_styling(col)
             if 'column_config_override' in col_meta:
+                #column_config_override, sent by the instantiation, gets set later
                 base_style.update(col_meta['column_config_override'])
+            if base_style.get('merge_rule') == 'hidden':
+                continue
             ret_col_config.append(base_style)
 
         return {
@@ -173,3 +190,4 @@ def style_columns(kls, sd):
             'extra_grid_config': kls.extra_grid_config,
             'component_config': kls.component_config
         }
+
diff --git a/docs/example-notebooks/Datacompy.ipynb b/docs/example-notebooks/Datacompy.ipynb
@@ -0,0 +1,76 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d8f47719-21e9-4a99-bb44-73c4f8b99c3d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import datacompy\n",
+    "from datacompy_app import DatacompyBuckaroo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4a405785-0456-4a78-aec8-4e8045d7462a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create sample DataFrames\n",
+    "df_a = pd.DataFrame({\n",
+    "    'a': [1, 2, 3, 4, 5, 6, 7, 8],\n",
+    "    'b': [4, 5, 6, 4, 4, 6, 7, 8],\n",
+    "    'c': ['foo', 'foo', 'bar', None, None, 'bar', 'bar', 'foo'],\n",
+    "    'e': [100, 10, 1, 200, 150, 140, 130, 120]})\n",
+    "\n",
+    "df_b = pd.DataFrame({\n",
+    "    'a': [1, 2, 3, 4, 5, 6, 7, 8],\n",
+    "    'b': [4, 5, 7, 4, 4, 6, 4, 4],\n",
+    "    'd': ['foo', 'baz', 'baz', 'bar', None, None, 'bar', 'bar'],\n",
+    "    'f': [100, 10, 1, 200, 150, 140, 130, 120]\n",
+    "})  # Note: df_b differs from df_a in column 'b' (rows 2, 6 and 7) and in column names ('d'/'f' vs 'c'/'e')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95d9ce1f-71b1-4fa2-bde6-8ffbc642c574",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DatacompyBuckaroo(df_a, df_b)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/example-notebooks/Styling-Howto.ipynb b/docs/example-notebooks/Styling-Howto.ipynb
@@ -22,12 +22,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "1",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Buckaroo has been enabled as the default DataFrame viewer.  To return to default dataframe visualization use `from buckaroo import disable; disable()`\n"
+     ]
+    }
+   ],
    "source": [
     "import pandas as pd\n",
     "import numpy as np\n",
@@ -38,7 +46,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "2",
    "metadata": {
     "tags": []
@@ -52,12 +60,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "3",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8273a2870220428f8ba3784ad246d176",
+       "version_major": 2,
+       "version_minor": 1
+      },
+      "text/plain": [
+       "BuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'post_pr…"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "BuckarooWidget(typed_df)"
    ]
@@ -73,12 +97,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "5",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "abbe10263ccb4bf1a013e500eb7e4a65",
+       "version_major": 2,
+       "version_minor": 1
+      },
+      "text/plain": [
+       "BuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'post_pr…"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "bw2 = BuckarooWidget(\n",
     "    typed_df, \n",
@@ -209,12 +249,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "id": "13",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "db49c1cad5af48c79acdf011253666dd",
+       "version_major": 2,
+       "version_minor": 1
+      },
+      "text/plain": [
+       "BuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'post_pr…"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "bw_ = BuckarooWidget(\n",
     "    typed_df, \n",
@@ -529,7 +585,14 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.20"
+   "version": "3.12.8"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
   }
  },
  "nbformat": 4,