996 profiling upgrades #1379

Closed · wants to merge 23 commits into master from 996_profiling_upgrades

Commits (23)
9dcc08c
Initial functional KDE graphs in profile data
sama-ds Jun 20, 2023
e187b09
Added functionality to turn on/off KDE charts when using profile colu…
sama-ds Jun 29, 2023
bc54397
Minor logic change.
sama-ds Jun 29, 2023
2d8b2c8
Linting changes and removing print statements used for debugging
sama-ds Jun 29, 2023
14fbeb9
lint with black
sama-ds Jun 29, 2023
1c4aa57
Functional changes - KDE yet to be finished so currently being worked on
sama-ds Aug 10, 2023
36a43f0
lint with black
sama-ds Aug 10, 2023
a4069fb
Removed code around correlation plot and moved to seperate pull reque…
sama-ds Aug 30, 2023
092df47
Added docstring
sama-ds Aug 30, 2023
83bdca0
Merge remote-tracking branch 'origin/master' into 996_profiling_upgrades
sama-ds Aug 30, 2023
d33cf28
Added example use to tutorial notebook.
sama-ds Aug 30, 2023
22fd8e6
lint with black
sama-ds Aug 30, 2023
5038bc6
profile_numeric_columns functionality
sama-ds Sep 22, 2023
879b37b
Merge branch 'master' into 996_profiling_upgrades
sama-ds Sep 22, 2023
0530de3
lint with black
sama-ds Sep 22, 2023
95f1b44
Updating docs to reflect changes
sama-ds Sep 22, 2023
6501773
Minor changes to appease linter
sama-ds Sep 22, 2023
8d732dc
lint with black
sama-ds Sep 22, 2023
5b2ba37
Removing un-uncessary files and code that were originally part of the…
sama-ds Nov 8, 2023
9b11ed5
Making minor changes on formatting and typecasting
sama-ds Nov 8, 2023
33e6891
lint with black
sama-ds Nov 8, 2023
e194bc1
Adding additional tests and error protection
sama-ds Nov 8, 2023
f96fafb
Fixed error catching
sama-ds Nov 8, 2023
61 changes: 61 additions & 0 deletions 996.py
@@ -0,0 +1,61 @@
import pandas as pd
from splink.duckdb.linker import DuckDBLinker
from IPython.display import display

df = pd.DataFrame(
[
{"unique_id": 1, "name": "chris", "test": 1, "test2": 1100, "test3": 0.2},
{"unique_id": 2, "name": "chris", "test": 1, "test2": 11100, "test3": 0.3},
{"unique_id": 3, "name": "sam", "test": 2, "test2": 1100, "test3": 0.4},
{"unique_id": 4, "name": "sam", "test": 2, "test2": 212150, "test3": 0.5},
{"unique_id": 5, "name": "sam", "test": 3, "test2": 221150, "test3": 0.6},
{"unique_id": 6, "name": "sam", "test": 4, "test2": 175, "test3": 0.7},
]
)

linker = DuckDBLinker(df)

# linker.debug_mode=True

# Classic

charts = linker.profile_columns(
column_expressions=["test", "test2", "test3"],
top_n=10,
bottom_n=5,
)

display(charts)

charts2 = linker.profile_columns(
column_expressions=["test", "test2", "test3"],
top_n=10,
bottom_n=None,
distribution_plots=False,
)

display(charts2)

charts3 = linker.profile_numeric_columns(
column_expressions=[
"test",
"test2",
"test3",
],
top_n=10,
bottom_n=5,
kde_plots=True,
)

display(charts3)

# charts4=linker.profile_columns(
# ["test", "test2", "test3"],
# top_n=None,
# bottom_n=None,
# distribution_plots=None,
# kde_plots=None,
# correlation_plot=True
# )

# display(charts4)
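(The commented-out charts4 block exercises the correlation_plot option, which commit a4069fb split out into a separate pull request.)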
2,566 changes: 2,402 additions & 164 deletions docs/demos/tutorials/02_Exploratory_analysis.ipynb

Large diffs are not rendered by default.

79 changes: 1 addition & 78 deletions splink/files/chart_defs/profile_data.json
@@ -1,80 +1,3 @@
{
"hconcat": [
{
"data": { "values": null },
"mark": { "type": "line", "interpolate": "step-after" },
"encoding": {
"x": {
"type": "quantitative",
"field": "percentile_ex_nulls",
"sort": "descending",
"title": "Percentile"
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Count of values"
},
"tooltip": [
{ "field": "value_count", "type": "quantitative" },
{ "field": "percentile_ex_nulls", "type": "quantitative" },
{ "field": "percentile_inc_nulls", "type": "quantitative" },
{ "field": "total_non_null_rows", "type": "quantitative" },
{ "field": "total_rows_inc_nulls", "type": "quantitative" }
]
},
"title": {
"text": "Distribution of counts of values in column",
"subtitle": "Subtitle Text"
}
},
{
"data": { "values": null },
"mark": "bar",
"encoding": {
"x": {
"type": "nominal",
"field": "value",
"sort": "-y",
"title": null
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Value count"
},
"tooltip": [
{ "field": "value", "type": "nominal" },
{ "field": "value_count", "type": "quantitative" },
{ "field": "total_non_null_rows", "type": "quantitative" },
{ "field": "total_rows_inc_nulls", "type": "quantitative" }
]
},
"title": "Top 20 values by value count"
},
{
"data": { "values": [] },
"mark": "bar",
"encoding": {
"x": {
"type": "nominal",
"field": "value",
"sort": "-y",
"title": null
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Value count"
},
"tooltip": [
{ "field": "value", "type": "nominal" },
{ "field": "value_count", "type": "quantitative" },
{ "field": "total_non_null_rows", "type": "quantitative" },
{ "field": "total_rows_inc_nulls", "type": "quantitative" }
]
},
"title": "Bottom 20 values by value count"
}
]

}
24 changes: 24 additions & 0 deletions splink/files/chart_defs/profile_data_bottom_n.json
@@ -0,0 +1,24 @@
{
"data": { "values": [] },
"mark": "bar",
"encoding": {
"x": {
"type": "nominal",
"field": "value",
"sort": "-y",
"title": null
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Value count"
},
"tooltip": [
{ "field": "value", "type": "nominal" },
{ "field": "value_count", "type": "quantitative" },
{ "field": "total_non_null_rows", "type": "quantitative" },
{ "field": "total_rows_inc_nulls", "type": "quantitative" }
]
},
"title": "Bottom 20 values by value count"
}
28 changes: 28 additions & 0 deletions splink/files/chart_defs/profile_data_distribution_plots.json
@@ -0,0 +1,28 @@
{
"data": { "values": null },
"mark": { "type": "line", "interpolate": "step-after" },
"encoding": {
"x": {
"type": "quantitative",
"field": "percentile_ex_nulls",
"sort": "descending",
"title": "Percentile"
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Count of values"
},
"tooltip": [
{ "field": "value_count", "type": "quantitative" },
{ "field": "percentile_ex_nulls", "type": "quantitative" },
{ "field": "percentile_inc_nulls", "type": "quantitative" },
{ "field": "total_non_null_rows", "type": "quantitative" },
{ "field": "total_rows_inc_nulls", "type": "quantitative" }
]
},
"title": {
"text": "Distribution of counts of values in column",
"subtitle": "Subtitle Text"
}
}
21 changes: 21 additions & 0 deletions splink/files/chart_defs/profile_data_kde.json
@@ -0,0 +1,21 @@
{
"data": { "values": [] },
"mark": "area",
"encoding": {
"x": {
"type": "nominal",
"field": "value",
"title": "Value"
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Density"
},
"tooltip": [
{ "field": "value", "type": "nominal" },
{ "field": "value_count", "type": "quantitative" }
]
},
"title": "Kernel Density Plot"
}
24 changes: 24 additions & 0 deletions splink/files/chart_defs/profile_data_top_n.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"data": { "values": null },
"mark": "bar",
"encoding": {
"x": {
"type": "nominal",
"field": "value",
"sort": "-y",
"title": null
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Value count"
},
"tooltip": [
{ "field": "value", "type": "nominal" },
{ "field": "value_count", "type": "quantitative" },
{ "field": "total_non_null_rows", "type": "quantitative" },
{ "field": "total_rows_inc_nulls", "type": "quantitative" }
]
},
"title": "Top 20 values by value count"
}
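Each of the chart definitions above is a self-contained Vega-Lite unit spec. At runtime profile_data.py loads them with load_chart_definition, deep-copies the relevant ones, injects query results into data.values, and horizontally concatenates the copies into one row per profiled column (see the reworked _get_inner_chart_spec_freq below). A minimal sketch of that composition pattern, using hypothetical stand-in specs rather than the real chart files:

from copy import deepcopy

# Hypothetical stand-ins for two loaded chart definitions.
top_n_spec = {"mark": "bar", "data": {"values": []}, "title": ""}
bottom_n_spec = {"mark": "bar", "data": {"values": []}, "title": ""}

def build_chart_row(top_n_data=None, bottom_n_data=None):
    # Copy each requested unit spec, inject its data, and concatenate.
    specs = []
    if top_n_data is not None:
        spec = deepcopy(top_n_spec)
        spec["data"]["values"] = top_n_data
        spec["title"] = f"Top {len(top_n_data)} values by value count"
        specs.append(spec)
    if bottom_n_data is not None:
        spec = deepcopy(bottom_n_spec)
        spec["data"]["values"] = bottom_n_data
        spec["title"] = f"Bottom {len(bottom_n_data)} values by value count"
        specs.append(spec)
    return {"hconcat": specs}

row = build_chart_row(top_n_data=[{"value": "sam", "value_count": 4}])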
87 changes: 27 additions & 60 deletions splink/linker.py
@@ -2080,69 +2080,36 @@ def cluster_pairwise_predictions_at_threshold(
return cc

def profile_columns(
self, column_expressions: str | list[str] = None, top_n=10, bottom_n=10
self,
column_expressions: str | list[str],
top_n: int = 10,
bottom_n: int = 10,
distribution_plots: bool = True,
):
"""
Profiles the specified columns of the dataframe initiated with the linker.
This can be computationally expensive if the dataframe is large.
For the provided columns with column_expressions (or for all columns if
left empty) calculate:
- A distribution plot that shows the count of values at each percentile.
- A top n chart, that produces a chart showing the count of the top n values
within the column
- A bottom n chart, that produces a chart showing the count of the bottom
n values within the column
This should be used to explore the dataframe, determine if columns have
sufficient completeness for linking, analyse the cardinality of columns, and
identify the need for standardisation within a given column.
Args:
linker (object): The initiated linker.
column_expressions (list, optional): A list of strings containing the
specified column names.
If left empty this will default to all columns.
top_n (int, optional): The number of top n values to plot.
bottom_n (int, optional): The number of bottom n values to plot.
Returns:
altair.Chart or dict: A visualization or JSON specification describing the
profiling charts.
Examples:
=== ":simple-duckdb: DuckDB"
```py
linker = DuckDBLinker(df)
linker.profile_columns()
```
=== ":simple-apachespark: Spark"
```py
linker = SparkLinker(df)
linker.profile_columns()
```
=== ":simple-amazonaws: Athena"
```py
linker = AthenaLinker(df)
linker.profile_columns()
```
=== ":simple-sqlite: SQLite"
```py
linker = SQLiteLinker(df)
linker.profile_columns()
```
Note:
- The `linker` object should be an instance of the initiated linker.
- The provided `column_expressions` can be a list of column names to
profile. If left empty, all columns will be profiled.
- The `top_n` and `bottom_n` parameters determine the number of top and
bottom values to display in the respective charts.
"""
return profile_columns(
self,
column_expressions,
top_n=top_n,
bottom_n=bottom_n,
kde_plots=False,
distribution_plots=distribution_plots,
)

def profile_numeric_columns(
self,
column_expressions: str | list[str],
top_n: int = 10,
bottom_n: int = 10,
kde_plots: bool = False,
distribution_plots: bool = True,
):
return profile_columns(
self, column_expressions=column_expressions, top_n=top_n, bottom_n=bottom_n
self,
column_expressions,
top_n=top_n,
bottom_n=bottom_n,
kde_plots=kde_plots,
distribution_plots=distribution_plots,
)

def _get_labels_tablename_from_input(
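The new profile_numeric_columns method carries no docstring yet; it simply forwards to profile_columns with the kde_plots flag exposed. For orientation, a minimal usage sketch in the spirit of the 996.py script above (the dataframe and column name are illustrative):

import pandas as pd
from splink.duckdb.linker import DuckDBLinker

df = pd.DataFrame({"unique_id": [1, 2, 3, 4], "age": [34, 45, 34, 60]})
linker = DuckDBLinker(df)

# KDE chart only: suppress the top/bottom-n bars and the percentile plot.
chart = linker.profile_numeric_columns(
    ["age"],
    top_n=None,
    bottom_n=None,
    kde_plots=True,
    distribution_plots=False,
)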
256 changes: 178 additions & 78 deletions splink/profile_data.py
@@ -1,9 +1,12 @@
import logging
import re
from copy import deepcopy

from .charts import altair_or_json, load_chart_definition
from .misc import ensure_is_list

logger = logging.getLogger(__name__)


def _group_name(cols_or_expr):
cols_or_expr = re.sub(r"[^0-9a-zA-Z_]", " ", cols_or_expr)
@@ -28,48 +31,85 @@ def expressions_to_sql(expressions):
"$schema": "https://vega.github.io/schema/vega-lite/v5.9.3.json",
}

chart_path = "profile_data.json"
_inner_chart_spec_freq = load_chart_definition(chart_path)
_inner_chart_spec = load_chart_definition("profile_data.json")
_distribution_plot = load_chart_definition(
"profile_data_distribution_plots.json"
)
_top_n_plot = load_chart_definition("profile_data_top_n.json")
_bottom_n_plot = load_chart_definition("profile_data_bottom_n.json")
_kde_plot = load_chart_definition("profile_data_kde.json")


def _get_inner_chart_spec_freq(
col_name,
percentile_data=None,
top_n_data=None,
bottom_n_data=None,
kde_data=None,
):

inner_spec = deepcopy(_inner_chart_spec)
inner_specs = []

if percentile_data is not None:
_distribution_plot_copy = deepcopy(_distribution_plot)
total_rows_inc_nulls = percentile_data[0]["total_rows_inc_nulls"]
total_non_null_rows = percentile_data[0]["total_non_null_rows"]
distinct_value_count = percentile_data[0]["distinct_value_count"]
perc = total_non_null_rows / total_rows_inc_nulls

sub = (
f"In this col, {total_rows_inc_nulls*(1-perc):,.0f} values "
f"({1-perc:,.1%}) are null and there are "
f"{distinct_value_count} distinct values"
)
sub = sub.format(**percentile_data[0])
_distribution_plotss_plot_copy["data"]["values"] = percentile_data
_distribution_plotss_plot_copy["title"][
"text"
] = f"Distribution of counts of values in column {col_name}"

_distribution_plotss_plot_copy["title"]["subtitle"] = sub

inner_specs.append(_distribution_plotss_plot_copy)

def _get_inner_chart_spec_freq(percentile_data, top_n_data, bottom_n_data, col_name):
inner_spec = deepcopy(_inner_chart_spec_freq)
if top_n_data is not None:
_top_n_plot_copy = deepcopy(_top_n_plot)
_top_n_plot_copy["data"]["values"] = top_n_data
_top_n_plot_copy["title"] = f"Top {len(top_n_data)} values by value count"

total_rows_inc_nulls = percentile_data[0]["total_rows_inc_nulls"]
total_non_null_rows = percentile_data[0]["total_non_null_rows"]
distinct_value_count = percentile_data[0]["distinct_value_count"]
perc = total_non_null_rows / total_rows_inc_nulls
inner_specs.append(_top_n_plot_copy)

sub = (
f"In this col, {total_rows_inc_nulls*(1-perc):,.0f} values "
f"({1-perc:,.1%}) are null and there are "
f"{distinct_value_count} distinct values"
)
sub = sub.format(**percentile_data[0])
inner_spec["hconcat"][0]["data"]["values"] = percentile_data
inner_spec["hconcat"][0]["title"][
"text"
] = f"Distribution of counts of values in column {col_name}"
if bottom_n_data is not None:
_bottom_n_plot_copy = deepcopy(_bottom_n_plot)
_bottom_n_plot_copy["data"]["values"] = bottom_n_data
_bottom_n_plot_copy[
"title"
] = f"Bottom {len(bottom_n_data)} values by value count"

inner_spec["hconcat"][0]["title"]["subtitle"] = sub
if top_n_data:
max_val = top_n_data[0]["value_count"]
_bottom_n_plot_copy["encoding"]["y"]["scale"] = {"domain": [0, max_val]}

inner_spec["hconcat"][1]["data"]["values"] = top_n_data
inner_spec["hconcat"][1]["title"] = f"Top {len(top_n_data)} values by value count"
inner_specs.append(_bottom_n_plot_copy)

inner_spec["hconcat"][2]["data"]["values"] = bottom_n_data
inner_spec["hconcat"][2][
"title"
] = f"Bottom {len(bottom_n_data)} values by value count"
if kde_data is not None:
_kde_plot_copy = deepcopy(_kde_plot)
_kde_plot_copy["data"]["values"] = kde_data
_kde_plot_copy["title"] = "Kernel Density Estimation"
_kde_plot_copy["mark"] = "area"
_kde_plot_copy["encoding"]["x"]["field"] = "value"
_kde_plot_copy["encoding"]["y"]["field"] = "value_count"

max_val = top_n_data[0]["value_count"]
inner_spec["hconcat"][2]["encoding"]["y"]["scale"] = {"domain": [0, max_val]}
inner_specs.append(_kde_plot_copy)

inner_spec["hconcat"] = inner_specs

return inner_spec


def _get_df_percentiles():
"""Take __splink__df_all_column_value_frequencies and
turn it into the raw data needed for the percentile cahrt
turn it into the raw data needed for the percentile chart
"""

sqls = []
@@ -117,6 +157,17 @@ def _get_df_percentiles():
return sqls


def _get_df_kde():
sql = """
select
value,
value_count,
group_name
from __splink__df_all_column_value_frequencies
"""
return sql
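Note that _get_df_kde simply re-selects the per-value frequencies, so the resulting chart plots observed counts rather than a smoothed density (the commit history records that the KDE work is unfinished). Purely for illustration, and not part of this PR, a smoothed estimate would need something like scipy's gaussian_kde:

import numpy as np
from scipy.stats import gaussian_kde

# Expand (value, value_count) pairs back into observations, then smooth.
values = np.repeat([1.0, 2.0, 3.0, 4.0], [2, 2, 1, 1])
kde = gaussian_kde(values)
xs = np.linspace(values.min(), values.max(), 100)
density = kde(xs)  # y-values an area mark could plot against xs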


def _get_df_top_bottom_n(expressions, limit=20, value_order="desc"):
sql = """
select * from
@@ -190,46 +241,60 @@ def _add_100_percentile_to_df_percentiles(percentile_rows):
return percentile_rows


def profile_columns(linker, column_expressions=None, top_n=10, bottom_n=10):
def profile_columns(
linker,
column_expressions: str | list[str],
top_n: int = 10,
bottom_n: int = 10,
distribution_plots: bool = True,
kde_plots: bool = False,
):
"""
Profiles the specified columns of the dataframe initiated with the linker.
This can be computationally expensive if the dataframe is large.
For the provided columns with column_expressions (or for all columns if left empty)
calculate:
we can calculate:
- A distribution plot that shows the count of values at each percentile.
- A top n chart, that produces a chart showing the count of the top n values
within the column
- A bottom n chart, that produces a chart showing the count of the bottom
n values within the column
- A bottom n chart, that produces a chart showing the count of the bottom n values
within the column
- A kernel density plot of a numeric variable.
This should be used to explore the dataframe, determine if columns have
sufficient completeness for linking, analyse the cardinality of columns, and
identify the need for standardisation within a given column.
This should be used to explore the dataframe, determine if columns have sufficient
completeness for linking, analyse the cardinality of columns, and identify the need
for standardisation within a given column.
Args:
linker (object): The initiated linker.
column_expressions (list, optional): A list of strings containing the
specified column names.
If left empty this will default to all columns.
column_expressions (list, optional): A list of strings containing the specified
column names. If left empty this will default to all columns.
top_n (int, optional): The number of top n values to plot.
Will default to 10, but if set to None the chart will not be
produced.
bottom_n (int, optional): The number of bottom n values to plot.
Will default to 10, but if set to None the chart will not be produced.
kde_plots (bool, optional): A boolean value indicating whether kde plots should be
produced.
distribution_plots (bool, optional): A boolean value indicating whether
distribution plots should be produced.
Returns:
altair.Chart or dict: A visualization or JSON specification describing the
profiling charts.
profiling charts.
Note:
- The `linker` object should be an instance of the initiated linker.
- The provided `column_expressions` can be a list of column names to profile.
If left empty, all columns will be profiled.
If left empty, all columns will be profiled.
- The `top_n` and `bottom_n` parameters determine the number of top and bottom
values to display in the respective charts.
values to display in the respective charts.
"""

if not column_expressions:
column_expressions = linker._get_input_columns
if top_n is None and bottom_n is None and not distribution_plots and not kde_plots:
logger.warning(
"Warning: No charts in profile_columns have been selected."
)
return None


df_concat = linker._initialise_df_concat()

@@ -243,48 +308,83 @@ def profile_columns(linker, column_expressions=None, top_n=10, bottom_n=10):
sql = _col_or_expr_frequencies_raw_data_sql(
column_expressions_raw, "__splink__df_concat"
)

linker._enqueue_sql(sql, "__splink__df_all_column_value_frequencies")
df_raw = linker._execute_sql_pipeline(input_dataframes)

sqls = _get_df_percentiles()
for sql in sqls:
linker._enqueue_sql(sql["sql"], sql["output_table_name"])

df_percentiles = linker._execute_sql_pipeline([df_raw])
percentile_rows_all = df_percentiles.as_record_dict()

sql = _get_df_top_bottom_n(column_expressions, top_n, "desc")
linker._enqueue_sql(sql, "__splink__df_top_n")
df_top_n = linker._execute_sql_pipeline([df_raw])
top_n_rows_all = df_top_n.as_record_dict()

sql = _get_df_top_bottom_n(column_expressions, bottom_n, "asc")
linker._enqueue_sql(sql, "__splink__df_bottom_n")
df_bottom_n = linker._execute_sql_pipeline([df_raw])
bottom_n_rows_all = df_bottom_n.as_record_dict()
if distribution_plots:
sqls = _get_df_percentiles()
for sql in sqls:
linker._enqueue_sql(sql["sql"], sql["output_table_name"])
df_percentiles = linker._execute_sql_pipeline([df_raw])
percentile_rows_all = df_percentiles.as_record_dict()
else:
percentile_rows_all = None
percentile_rows = None

if top_n is not None:
sql = _get_df_top_bottom_n(column_expressions, top_n, "desc")
linker._enqueue_sql(sql, "__splink__df_top_n")
df_top_n = linker._execute_sql_pipeline([df_raw])
top_n_rows_all = df_top_n.as_record_dict()
else:
top_n_rows_all = None
top_n_rows = None

if kde_plots:
sql = _get_df_kde()
linker._enqueue_sql(sql, "__splink__df_kde")
df_kde = linker._execute_sql_pipeline([df_raw])
kde_rows_all = df_kde.as_record_dict()
else:
kde_rows_all = None
kde_rows = None

if bottom_n is not None:
sql = _get_df_top_bottom_n(column_expressions, bottom_n, "asc")
linker._enqueue_sql(sql, "__splink__df_bottom_n")
df_bottom_n = linker._execute_sql_pipeline([df_raw])
bottom_n_rows_all = df_bottom_n.as_record_dict()
else:
bottom_n_rows_all = None
bottom_n_rows = None

inner_charts = []

for expression in column_expressions:
percentile_rows = [
p for p in percentile_rows_all if p["group_name"] == _group_name(expression)
]
percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows)
top_n_rows = [
p for p in top_n_rows_all if p["group_name"] == _group_name(expression)
]
bottom_n_rows = [
p for p in bottom_n_rows_all if p["group_name"] == _group_name(expression)
]
# remove concat blank from expression title
expression = expression.replace(", ' '", "")

if distribution_plots:
percentile_rows = [
p
for p in percentile_rows_all
if p["group_name"] == _group_name(expression)
]
percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows)
if top_n is not None:
top_n_rows = [
p for p in top_n_rows_all if p["group_name"] == _group_name(expression)
]
if bottom_n is not None:
bottom_n_rows = [
p
for p in bottom_n_rows_all
if p["group_name"] == _group_name(expression)
]
if kde_plots:
kde_rows = [
p for p in kde_rows_all if p["group_name"] == _group_name(expression)
]

inner_chart = _get_inner_chart_spec_freq(
percentile_rows, top_n_rows, bottom_n_rows, expression
percentile_data=percentile_rows,
top_n_data=top_n_rows,
bottom_n_data=bottom_n_rows,
kde_data=kde_rows,
col_name=expression,
)

inner_charts.append(inner_chart)
outer_spec = deepcopy(_outer_chart_spec_freq)

outer_spec = deepcopy(_outer_chart_spec_freq)
outer_spec["vconcat"] = inner_charts

return altair_or_json(outer_spec)
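For reference, a minimal sketch of how the new gating behaves end to end, assuming an initialised linker and illustrative column names: with at least one chart type enabled the function returns the concatenated spec; with everything disabled it logs the warning above and returns None.

charts = profile_columns(
    linker,
    ["first_name", "surname"],
    top_n=10,
    bottom_n=5,
)  # altair.Chart or JSON spec

nothing = profile_columns(
    linker,
    ["first_name"],
    top_n=None,
    bottom_n=None,
    distribution_plots=False,
    kde_plots=False,
)  # logs "Warning: No charts in profile_columns have been selected." -> None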
52 changes: 52 additions & 0 deletions tests/test_profile_data.py
@@ -14,6 +14,7 @@
from splink.sqlite.linker import SQLiteLinker

from .basic_settings import get_settings_dict
from .decorator import mark_with_dialects_excluding


def generate_raw_profile_dataset(columns_to_profile, linker):
@@ -176,3 +177,54 @@ def test_profile_using_spark(df_spark):
)

assert len(generate_raw_profile_dataset([["first_name", "blank"]], linker)) == 0


@mark_with_dialects_excluding()
def test_profile_data(test_helpers, dialect, caplog):
helper = test_helpers[dialect]
settings = get_settings_dict()
Linker = helper.Linker

df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
linker = Linker(df, settings, **helper.extra_linker_args())

# Test original syntax
linker.profile_columns(
["first_name", "city", "surname", "email", "substr(dob, 1,4)"],
top_n=10,
bottom_n=5,
)

# Test new chart specifically
linker.profile_numeric_columns(
["substr(dob, 1,4)"],
top_n=None,
bottom_n=None,
kde_plots=True,
distribution_plots=False,
)

# Test ability to show all elements
linker.profile_numeric_columns(
["first_name", "city", "surname", "email", "substr(dob, 1,4)"],
top_n=10,
bottom_n=5,
kde_plots=True,
distribution_plots=True,
)

# Test error message when user requests 0 elements
linker.profile_numeric_columns(
["first_name", "city", "surname", "email", "substr(dob, 1,4)"],
top_n=None,
bottom_n=None,
kde_plots=False,
distribution_plots=False,
)

captured_logs = caplog.text

assert (
"Warning: No charts in profile_columns have been selected."
in captured_logs
)