Skip to content

Commit

Permalink
Feat/datacompy (#341)
Browse files Browse the repository at this point in the history
# Release 0.8.2
This release makes it easier to build apps on top of buckaroo.

Post-processing functions can now hide columns.
`CustomizableDataflow` (which all widgets extend) gets a new `init_sd` parameter, an initial summary_dict. This makes it easier to hard-code summary_dict values.

More resiliency around styling columns. Previously, if a call to `style_column` failed, either an error was thrown or the column was hidden; now a default `obj` displayer is used instead.

The [Datacompy_app](https://github.com/capitalone/datacompy/issues/372) example was built using this new functionality. This app compares dataframes with the [datacompy](https://github.com/capitalone/datacompy) library.
  • Loading branch information
paddymul authored Jan 16, 2025
1 parent 2618922 commit 99de0a1
Show file tree
Hide file tree
Showing 12 changed files with 634 additions and 50 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,13 @@ It also moves the js code to `packages/buckaroo_js_core` This is a regular react

None of the end user experience should change with this release.

## 0.8.2 2025-01-15

This release makes it easier to build apps on top of buckaroo.

Post-processing functions can now hide columns.
`CustomizableDataflow` (which all widgets extend) gets a new `init_sd` parameter, an initial summary_dict. This makes it easier to hard-code summary_dict values.

More resiliency around styling columns. Previously, if a call to `style_column` failed, either an error was thrown or the column was hidden; now a default `obj` displayer is used instead.

The [Datacompy_app](https://github.com/capitalone/datacompy/issues/372) example was built using this new functionality. This app compares dataframes with the [datacompy](https://github.com/capitalone/datacompy) library.
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,19 @@ uv venv
uv sync -q

```
### Release instructions
[github release instructions](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository)

```bash
git tag $VERSION_NUMBER #no leading v
# update CHANGELOG.md
#push code and tag to github
```
Navigate to [create new buckaroo release](https://github.com/paddymul/buckaroo/releases/new)
and follow the instructions there.




## Contributions

Expand Down
21 changes: 19 additions & 2 deletions buckaroo/dataflow/dataflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def _summary_sd(self, change):
result_summary_sd = self._get_summary_sd(self.processed_df)
self.summary_sd = result_summary_sd

@observe('summary_sd')
@observe('summary_sd', 'processed_result')
@exception_protect('merged_sd-protector')
def _merged_sd(self, change):
#slightly inconsistent that processed_sd gets priority over
Expand Down Expand Up @@ -207,7 +207,12 @@ class CustomizableDataflow(DataFlow):
def __init__(self, orig_df, debug=False,
column_config_overrides=None,
pinned_rows=None, extra_grid_config=None,
component_config=None):
component_config=None, init_sd=None):
if init_sd is None:
self.init_sd = {}
else:
self.init_sd = init_sd

if column_config_overrides is None:
column_config_overrides = {}
self.column_config_overrides = column_config_overrides
Expand Down Expand Up @@ -273,6 +278,16 @@ def setup_options_from_analysis(self):
df_data_dict = Any({'empty':[]}).tag(sync=True)


@observe('summary_sd', 'processed_result')
@exception_protect('merged_sd-protector')
def _merged_sd(self, change):
#Recomputes merged_sd by merging, in this argument order, init_sd,
#cleaned_sd, summary_sd and processed_sd. Registered via @observe
#on 'summary_sd' and 'processed_result'.
#
#slightly inconsistent that processed_sd gets priority over
#summary_sd, given that processed_df is computed first. My
#thinking was that processed_sd has greater total knowledge
#and should supersede summary_sd.
self.merged_sd = merge_sds(self.init_sd, self.cleaned_sd, self.summary_sd, self.processed_sd)


### start code interpreter block
def add_command(self, incomingCommandKls):
return self.ac_obj.add_command(incomingCommandKls)
Expand Down Expand Up @@ -374,6 +389,8 @@ def _handle_widget_change(self, change):
self.df_display_args = temp_display_args

"""
Instantiation
df_data_dict starts with only 'empty'
first populate df_display_args, make all data point to 'empty', make all df_viewer_configs EMPTY_DFVIEWER_CONFIG
Expand Down
24 changes: 21 additions & 3 deletions buckaroo/dataflow/dataflow_extras.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import sys
import logging


import pandas as pd
from buckaroo.pluggable_analysis_framework.pluggable_analysis_framework import (ColAnalysis)

logger = logging.getLogger()

EMPTY_DFVIEWER_CONFIG = {
'pinned_rows': [],
'column_config': []}
Expand Down Expand Up @@ -151,20 +156,32 @@ def style_column(kls, col, column_metadata):
data_key = "main"
summary_stats_key= 'all_stats'

@classmethod
def default_styling(kls, col_name):
#Returns a column config for col_name that uses the generic 'obj'
#displayer; style_columns uses it for the added 'index' column and
#as the fallback when style_column raises.
return {'col_name': col_name, 'displayer_args': {'displayer': 'obj'}}

@classmethod
def style_columns(kls, sd):
ret_col_config = []

#this is necessary for polars to add an index column, which is
#required so that summary_stats makes sense
if 'index' not in sd:
ret_col_config.append({'col_name': 'index', 'displayer_args': {'displayer': 'obj'}})
ret_col_config.append(kls.default_styling('index'))

for col in sd.keys():
col_meta = sd[col]
base_style = kls.style_column(col, col_meta)
if col_meta.get('merge_rule') == 'hidden':
continue
try:
base_style = kls.style_column(col, col_meta)
except Exception:
logger.warn(f"Warning, styling failed from {kls} on column {col} with col_meta {col_meta} using default_styling instead")
base_style = kls.default_styling(col)
if 'column_config_override' in col_meta:
#column_config_override, sent by the instantiation, gets set later
base_style.update(col_meta['column_config_override'])
if base_style.get('merge_rule') == 'hidden':
continue
ret_col_config.append(base_style)

return {
Expand All @@ -173,3 +190,4 @@ def style_columns(kls, sd):
'extra_grid_config': kls.extra_grid_config,
'component_config': kls.component_config
}

76 changes: 76 additions & 0 deletions docs/example-notebooks/Datacompy.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "d8f47719-21e9-4a99-bb44-73c4f8b99c3d",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import datacompy\n",
"from datacompy_app import DatacompyBuckaroo"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a405785-0456-4a78-aec8-4e8045d7462a",
"metadata": {},
"outputs": [],
"source": [
"# Create sample DataFrames\n",
"df_a = pd.DataFrame({\n",
" 'a': [1, 2, 3, 4, 5, 6, 7, 8],\n",
" 'b': [4, 5, 6, 4, 4, 6, 7, 8],\n",
" 'c': ['foo', 'foo', 'bar', None, None, 'bar', 'bar', 'foo'],\n",
" 'e': [100, 10, 1, 200, 150, 140, 130, 120]})\n",
"\n",
"df_b = pd.DataFrame({\n",
" 'a': [1, 2, 3, 4, 5, 6, 7, 8],\n",
" 'b': [4, 5, 7, 4, 4, 6, 4, 4],\n",
" 'd': ['foo', 'baz', 'baz', 'bar', None, None, 'bar', 'bar'],\n",
" 'f': [100, 10, 1, 200, 150, 140, 130, 120]\n",
"}) # Notice the difference in the last row"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95d9ce1f-71b1-4fa2-bde6-8ffbc642c574",
"metadata": {},
"outputs": [],
"source": [
"DatacompyBuckaroo(df_a, df_b)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
83 changes: 73 additions & 10 deletions docs/example-notebooks/Styling-Howto.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "1",
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo import disable; disable()`\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
Expand All @@ -38,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "2",
"metadata": {
"tags": []
Expand All @@ -52,12 +60,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "3",
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8273a2870220428f8ba3784ad246d176",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"BuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'post_pr…"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"BuckarooWidget(typed_df)"
]
Expand All @@ -73,12 +97,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "5",
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "abbe10263ccb4bf1a013e500eb7e4a65",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"BuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'post_pr…"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bw2 = BuckarooWidget(\n",
" typed_df, \n",
Expand Down Expand Up @@ -209,12 +249,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"id": "13",
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "db49c1cad5af48c79acdf011253666dd",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"BuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'post_pr…"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bw_ = BuckarooWidget(\n",
" typed_df, \n",
Expand Down Expand Up @@ -529,7 +585,14 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
"version": "3.12.8"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit 99de0a1

Please sign in to comment.