996 profiling upgrades #1379

Closed · wants to merge 23 commits into master from 996_profiling_upgrades

Commits (23)
9dcc08c
Initial functional KDE graphs in profile data
sama-ds Jun 20, 2023
e187b09
Added functionality to turn on/off KDE charts when using profile colu…
sama-ds Jun 29, 2023
bc54397
Minor logic change.
sama-ds Jun 29, 2023
2d8b2c8
Linting changes and removing print statements used for debugging
sama-ds Jun 29, 2023
14fbeb9
lint with black
sama-ds Jun 29, 2023
1c4aa57
Functional changes - KDE yet to be finished so currently being worked on
sama-ds Aug 10, 2023
36a43f0
lint with black
sama-ds Aug 10, 2023
a4069fb
Removed code around correlation plot and moved to seperate pull reque…
sama-ds Aug 30, 2023
092df47
Added docstring
sama-ds Aug 30, 2023
83bdca0
Merge remote-tracking branch 'origin/master' into 996_profiling_upgrades
sama-ds Aug 30, 2023
d33cf28
Added example use to tutorial notebook.
sama-ds Aug 30, 2023
22fd8e6
lint with black
sama-ds Aug 30, 2023
5038bc6
profile_numeric_columns functionality
sama-ds Sep 22, 2023
879b37b
Merge branch 'master' into 996_profiling_upgrades
sama-ds Sep 22, 2023
0530de3
lint with black
sama-ds Sep 22, 2023
95f1b44
Updating docs to reflect changes
sama-ds Sep 22, 2023
6501773
Minor changes to appease linter
sama-ds Sep 22, 2023
8d732dc
lint with black
sama-ds Sep 22, 2023
5b2ba37
Removing un-uncessary files and code that were originally part of the…
sama-ds Nov 8, 2023
9b11ed5
Making minor changes on formatting and typecasting
sama-ds Nov 8, 2023
33e6891
lint with black
sama-ds Nov 8, 2023
e194bc1
Adding additional tests and error protection
sama-ds Nov 8, 2023
f96fafb
Fixed error catching
sama-ds Nov 8, 2023
61 changes: 61 additions & 0 deletions 996.py
@@ -0,0 +1,61 @@
import pandas as pd
from splink.duckdb.linker import DuckDBLinker
from IPython.display import display

df = pd.DataFrame(
[
{"unique_id": 1, "name": "chris", "test": 1, "test2": 1100, "test3": 0.2},
{"unique_id": 2, "name": "chris", "test": 1, "test2": 11100, "test3": 0.3},
{"unique_id": 3, "name": "sam", "test": 2, "test2": 1100, "test3": 0.4},
{"unique_id": 4, "name": "sam", "test": 2, "test2": 212150, "test3": 0.5},
{"unique_id": 5, "name": "sam", "test": 3, "test2": 221150, "test3": 0.6},
{"unique_id": 6, "name": "sam", "test": 4, "test2": 175, "test3": 0.7},
]
)

linker = DuckDBLinker(df)

# linker.debug_mode=True

# Classic

charts = linker.profile_columns(
column_expressions=["test", "test2", "test3"],
top_n=10,
bottom_n=5,
)

display(charts)

charts2 = linker.profile_columns(
column_expressions=["test", "test2", "test3"],
top_n=10,
bottom_n=None,
distribution_plots=False,
)

display(charts2)

charts3 = linker.profile_numeric_columns(
column_expressions=[
"test",
"test2",
"test3",
],
top_n=10,
bottom_n=5,
kde_plots=True,
)

display(charts3)

# charts4=linker.profile_columns(
# ["test", "test2", "test3"],
# top_n=None,
# bottom_n=None,
# distribution_plots=None,
# kde_plots=None,
# correlation_plot=True
# )

# display(charts4)
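(The commented-out charts4 block exercises the correlation_plot option, which commit a4069fb split out into a separate pull request.)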
2,566 changes: 2,402 additions & 164 deletions docs/demos/tutorials/02_Exploratory_analysis.ipynb

Large diffs are not rendered by default.

79 changes: 1 addition & 78 deletions splink/files/chart_defs/profile_data.json
@@ -1,80 +1,3 @@
{
"hconcat": [
{
"data": { "values": null },
"mark": { "type": "line", "interpolate": "step-after" },
"encoding": {
"x": {
"type": "quantitative",
"field": "percentile_ex_nulls",
"sort": "descending",
"title": "Percentile"
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Count of values"
},
"tooltip": [
{ "field": "value_count", "type": "quantitative" },
{ "field": "percentile_ex_nulls", "type": "quantitative" },
{ "field": "percentile_inc_nulls", "type": "quantitative" },
{ "field": "total_non_null_rows", "type": "quantitative" },
{ "field": "total_rows_inc_nulls", "type": "quantitative" }
]
},
"title": {
"text": "Distribution of counts of values in column",
"subtitle": "Subtitle Text"
}
},
{
"data": { "values": null },
"mark": "bar",
"encoding": {
"x": {
"type": "nominal",
"field": "value",
"sort": "-y",
"title": null
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Value count"
},
"tooltip": [
{ "field": "value", "type": "nominal" },
{ "field": "value_count", "type": "quantitative" },
{ "field": "total_non_null_rows", "type": "quantitative" },
{ "field": "total_rows_inc_nulls", "type": "quantitative" }
]
},
"title": "Top 20 values by value count"
},
{
"data": { "values": [] },
"mark": "bar",
"encoding": {
"x": {
"type": "nominal",
"field": "value",
"sort": "-y",
"title": null
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Value count"
},
"tooltip": [
{ "field": "value", "type": "nominal" },
{ "field": "value_count", "type": "quantitative" },
{ "field": "total_non_null_rows", "type": "quantitative" },
{ "field": "total_rows_inc_nulls", "type": "quantitative" }
]
},
"title": "Bottom 20 values by value count"
}
]

}
24 changes: 24 additions & 0 deletions splink/files/chart_defs/profile_data_bottom_n.json
@@ -0,0 +1,24 @@
{
"data": { "values": [] },
"mark": "bar",
"encoding": {
"x": {
"type": "nominal",
"field": "value",
"sort": "-y",
"title": null
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Value count"
},
"tooltip": [
{ "field": "value", "type": "nominal" },
{ "field": "value_count", "type": "quantitative" },
{ "field": "total_non_null_rows", "type": "quantitative" },
{ "field": "total_rows_inc_nulls", "type": "quantitative" }
]
},
"title": "Bottom 20 values by value count"
}
28 changes: 28 additions & 0 deletions splink/files/chart_defs/profile_data_distribution_plots.json
@@ -0,0 +1,28 @@
{
"data": { "values": null },
"mark": { "type": "line", "interpolate": "step-after" },
"encoding": {
"x": {
"type": "quantitative",
"field": "percentile_ex_nulls",
"sort": "descending",
"title": "Percentile"
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Count of values"
},
"tooltip": [
{ "field": "value_count", "type": "quantitative" },
{ "field": "percentile_ex_nulls", "type": "quantitative" },
{ "field": "percentile_inc_nulls", "type": "quantitative" },
{ "field": "total_non_null_rows", "type": "quantitative" },
{ "field": "total_rows_inc_nulls", "type": "quantitative" }
]
},
"title": {
"text": "Distribution of counts of values in column",
"subtitle": "Subtitle Text"
}
}
21 changes: 21 additions & 0 deletions splink/files/chart_defs/profile_data_kde.json
@@ -0,0 +1,21 @@
{
"data": { "values": [] },
"mark": "area",
"encoding": {
"x": {
"type": "nominal",
"field": "value",
"title": "Value"
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Density"
},
"tooltip": [
{ "field": "value", "type": "nominal" },
{ "field": "value_count", "type": "quantitative" }
]
},
"title": "Kernel Density Plot"
}
24 changes: 24 additions & 0 deletions splink/files/chart_defs/profile_data_top_n.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"data": { "values": null },
"mark": "bar",
"encoding": {
"x": {
"type": "nominal",
"field": "value",
"sort": "-y",
"title": null
},
"y": {
"type": "quantitative",
"field": "value_count",
"title": "Value count"
},
"tooltip": [
{ "field": "value", "type": "nominal" },
{ "field": "value_count", "type": "quantitative" },
{ "field": "total_non_null_rows", "type": "quantitative" },
{ "field": "total_rows_inc_nulls", "type": "quantitative" }
]
},
"title": "Top 20 values by value count"
}
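Each of the chart definitions above is a self-contained Vega-Lite unit spec. At runtime profile_data.py loads them with load_chart_definition, deep-copies the relevant ones, injects query results into data.values, and horizontally concatenates the copies into one row per profiled column (see the reworked _get_inner_chart_spec_freq below). A minimal sketch of that composition pattern, using hypothetical stand-in specs rather than the real chart files:

from copy import deepcopy

# Hypothetical stand-ins for two loaded chart definitions.
top_n_spec = {"mark": "bar", "data": {"values": []}, "title": ""}
bottom_n_spec = {"mark": "bar", "data": {"values": []}, "title": ""}

def build_chart_row(top_n_data=None, bottom_n_data=None):
    # Copy each requested unit spec, inject its data, and concatenate.
    specs = []
    if top_n_data is not None:
        spec = deepcopy(top_n_spec)
        spec["data"]["values"] = top_n_data
        spec["title"] = f"Top {len(top_n_data)} values by value count"
        specs.append(spec)
    if bottom_n_data is not None:
        spec = deepcopy(bottom_n_spec)
        spec["data"]["values"] = bottom_n_data
        spec["title"] = f"Bottom {len(bottom_n_data)} values by value count"
        specs.append(spec)
    return {"hconcat": specs}

row = build_chart_row(top_n_data=[{"value": "sam", "value_count": 4}])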
87 changes: 27 additions & 60 deletions splink/linker.py
@@ -2080,69 +2080,36 @@ def cluster_pairwise_predictions_at_threshold(
return cc

def profile_columns(
self, column_expressions: str | list[str] = None, top_n=10, bottom_n=10
self,
column_expressions: str | list[str],
top_n: int = 10,
bottom_n: int = 10,
distribution_plots: bool = True,
):
"""
Profiles the specified columns of the dataframe initiated with the linker.
This can be computationally expensive if the dataframe is large.
For the provided columns with column_expressions (or for all columns if
left empty) calculate:
- A distribution plot that shows the count of values at each percentile.
- A top n chart, that produces a chart showing the count of the top n values
within the column
- A bottom n chart, that produces a chart showing the count of the bottom
n values within the column
This should be used to explore the dataframe, determine if columns have
sufficient completeness for linking, analyse the cardinality of columns, and
identify the need for standardisation within a given column.
Args:
linker (object): The initiated linker.
column_expressions (list, optional): A list of strings containing the
specified column names.
If left empty this will default to all columns.
top_n (int, optional): The number of top n values to plot.
bottom_n (int, optional): The number of bottom n values to plot.
Returns:
altair.Chart or dict: A visualization or JSON specification describing the
profiling charts.
Examples:
=== ":simple-duckdb: DuckDB"
```py
linker = DuckDBLinker(df)
linker.profile_columns()
```
=== ":simple-apachespark: Spark"
```py
linker = SparkLinker(df)
linker.profile_columns()
```
=== ":simple-amazonaws: Athena"
```py
linker = AthenaLinker(df)
linker.profile_columns()
```
=== ":simple-sqlite: SQLite"
```py
linker = SQLiteLinker(df)
linker.profile_columns()
```
Note:
- The `linker` object should be an instance of the initiated linker.
- The provided `column_expressions` can be a list of column names to
profile. If left empty, all columns will be profiled.
- The `top_n` and `bottom_n` parameters determine the number of top and
bottom values to display in the respective charts.
"""
return profile_columns(
self,
column_expressions,
top_n=top_n,
bottom_n=bottom_n,
kde_plots=False,
distribution_plots=distribution_plots,
)

def profile_numeric_columns(
self,
column_expressions: str | list[str],
top_n: int = 10,
bottom_n: int = 10,
kde_plots: bool = False,
distribution_plots: bool = True,
):
return profile_columns(
self, column_expressions=column_expressions, top_n=top_n, bottom_n=bottom_n
self,
column_expressions,
top_n=top_n,
bottom_n=bottom_n,
kde_plots=kde_plots,
distribution_plots=distribution_plots,
)

def _get_labels_tablename_from_input(
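The new profile_numeric_columns method carries no docstring yet; it simply forwards to profile_columns with the kde_plots flag exposed. For orientation, a minimal usage sketch in the spirit of the 996.py script above (the dataframe and column name are illustrative):

import pandas as pd
from splink.duckdb.linker import DuckDBLinker

df = pd.DataFrame({"unique_id": [1, 2, 3, 4], "age": [34, 45, 34, 60]})
linker = DuckDBLinker(df)

# KDE chart only: suppress the top/bottom-n bars and the percentile plot.
chart = linker.profile_numeric_columns(
    ["age"],
    top_n=None,
    bottom_n=None,
    kde_plots=True,
    distribution_plots=False,
)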
256 changes: 178 additions & 78 deletions splink/profile_data.py
@@ -1,9 +1,12 @@
import logging
import re
from copy import deepcopy

from .charts import altair_or_json, load_chart_definition
from .misc import ensure_is_list

logger = logging.getLogger(__name__)


def _group_name(cols_or_expr):
cols_or_expr = re.sub(r"[^0-9a-zA-Z_]", " ", cols_or_expr)
@@ -28,48 +31,85 @@ def expressions_to_sql(expressions):
"$schema": "https://vega.github.io/schema/vega-lite/v5.9.3.json",
}

chart_path = "profile_data.json"
_inner_chart_spec_freq = load_chart_definition(chart_path)
_inner_chart_spec = load_chart_definition("profile_data.json")
_distribution_plot = load_chart_definition(
"profile_data_distribution_plots.json"
)
_top_n_plot = load_chart_definition("profile_data_top_n.json")
_bottom_n_plot = load_chart_definition("profile_data_bottom_n.json")
_kde_plot = load_chart_definition("profile_data_kde.json")


def _get_inner_chart_spec_freq(
col_name,
percentile_data=None,
top_n_data=None,
bottom_n_data=None,
kde_data=None,
):

inner_spec = deepcopy(_inner_chart_spec)
inner_specs = []

if percentile_data is not None:
_distribution_plot_copy = deepcopy(_distribution_plot)
total_rows_inc_nulls = percentile_data[0]["total_rows_inc_nulls"]
total_non_null_rows = percentile_data[0]["total_non_null_rows"]
distinct_value_count = percentile_data[0]["distinct_value_count"]
perc = total_non_null_rows / total_rows_inc_nulls

sub = (
f"In this col, {total_rows_inc_nulls*(1-perc):,.0f} values "
f"({1-perc:,.1%}) are null and there are "
f"{distinct_value_count} distinct values"
)
sub = sub.format(**percentile_data[0])
_distribution_plotss_plot_copy["data"]["values"] = percentile_data
_distribution_plotss_plot_copy["title"][
"text"
] = f"Distribution of counts of values in column {col_name}"

_distribution_plotss_plot_copy["title"]["subtitle"] = sub

inner_specs.append(_distribution_plotss_plot_copy)

def _get_inner_chart_spec_freq(percentile_data, top_n_data, bottom_n_data, col_name):
inner_spec = deepcopy(_inner_chart_spec_freq)
if top_n_data is not None:
_top_n_plot_copy = deepcopy(_top_n_plot)
_top_n_plot_copy["data"]["values"] = top_n_data
_top_n_plot_copy["title"] = f"Top {len(top_n_data)} values by value count"

total_rows_inc_nulls = percentile_data[0]["total_rows_inc_nulls"]
total_non_null_rows = percentile_data[0]["total_non_null_rows"]
distinct_value_count = percentile_data[0]["distinct_value_count"]
perc = total_non_null_rows / total_rows_inc_nulls
inner_specs.append(_top_n_plot_copy)

sub = (
f"In this col, {total_rows_inc_nulls*(1-perc):,.0f} values "
f"({1-perc:,.1%}) are null and there are "
f"{distinct_value_count} distinct values"
)
sub = sub.format(**percentile_data[0])
inner_spec["hconcat"][0]["data"]["values"] = percentile_data
inner_spec["hconcat"][0]["title"][
"text"
] = f"Distribution of counts of values in column {col_name}"
if bottom_n_data is not None:
_bottom_n_plot_copy = deepcopy(_bottom_n_plot)
_bottom_n_plot_copy["data"]["values"] = bottom_n_data
_bottom_n_plot_copy[
"title"
] = f"Bottom {len(bottom_n_data)} values by value count"

inner_spec["hconcat"][0]["title"]["subtitle"] = sub
if top_n_data:
max_val = top_n_data[0]["value_count"]
_bottom_n_plot_copy["encoding"]["y"]["scale"] = {"domain": [0, max_val]}

inner_spec["hconcat"][1]["data"]["values"] = top_n_data
inner_spec["hconcat"][1]["title"] = f"Top {len(top_n_data)} values by value count"
inner_specs.append(_bottom_n_plot_copy)

inner_spec["hconcat"][2]["data"]["values"] = bottom_n_data
inner_spec["hconcat"][2][
"title"
] = f"Bottom {len(bottom_n_data)} values by value count"
if kde_data is not None:
_kde_plot_copy = deepcopy(_kde_plot)
_kde_plot_copy["data"]["values"] = kde_data
_kde_plot_copy["title"] = "Kernel Density Estimation"
_kde_plot_copy["mark"] = "area"
_kde_plot_copy["encoding"]["x"]["field"] = "value"
_kde_plot_copy["encoding"]["y"]["field"] = "value_count"

max_val = top_n_data[0]["value_count"]
inner_spec["hconcat"][2]["encoding"]["y"]["scale"] = {"domain": [0, max_val]}
inner_specs.append(_kde_plot_copy)

inner_spec["hconcat"] = inner_specs

return inner_spec


def _get_df_percentiles():
"""Take __splink__df_all_column_value_frequencies and
turn it into the raw data needed for the percentile cahrt
turn it into the raw data needed for the percentile chart
"""

sqls = []
@@ -117,6 +157,17 @@ def _get_df_percentiles():
return sqls


def _get_df_kde():
sql = """
select
value,
value_count,
group_name
from __splink__df_all_column_value_frequencies
"""
return sql
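Note that _get_df_kde simply re-selects the per-value frequencies, so the resulting chart plots observed counts rather than a smoothed density (the commit history records that the KDE work is unfinished). Purely for illustration, and not part of this PR, a smoothed estimate would need something like scipy's gaussian_kde:

import numpy as np
from scipy.stats import gaussian_kde

# Expand (value, value_count) pairs back into observations, then smooth.
values = np.repeat([1.0, 2.0, 3.0, 4.0], [2, 2, 1, 1])
kde = gaussian_kde(values)
xs = np.linspace(values.min(), values.max(), 100)
density = kde(xs)  # y-values an area mark could plot against xs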


def _get_df_top_bottom_n(expressions, limit=20, value_order="desc"):
sql = """
select * from
@@ -190,46 +241,60 @@ def _add_100_percentile_to_df_percentiles(percentile_rows):
return percentile_rows


def profile_columns(linker, column_expressions=None, top_n=10, bottom_n=10):
def profile_columns(
linker,
column_expressions: str | list[str],
top_n: int = 10,
bottom_n: int = 10,
distribution_plots: bool = True,
kde_plots: bool = False,
):
"""
Profiles the specified columns of the dataframe initiated with the linker.
This can be computationally expensive if the dataframe is large.
For the provided columns with column_expressions (or for all columns if left empty)
calculate:
we can calculate:
- A distribution plot that shows the count of values at each percentile.
- A top n chart, that produces a chart showing the count of the top n values
within the column
- A bottom n chart, that produces a chart showing the count of the bottom
n values within the column
- A bottom n chart, that produces a chart showing the count of the bottom n values
within the column
- A kernel density plot of a numeric variable.
This should be used to explore the dataframe, determine if columns have
sufficient completeness for linking, analyse the cardinality of columns, and
identify the need for standardisation within a given column.
This should be used to explore the dataframe, determine if columns have sufficient
completeness for linking, analyse the cardinality of columns, and identify the need
for standardisation within a given column.
Args:
linker (object): The initiated linker.
column_expressions (list, optional): A list of strings containing the
specified column names.
If left empty this will default to all columns.
column_expressions (list, optional): A list of strings containing the specified
column names. If left empty this will default to all columns.
top_n (int, optional): The number of top n values to plot.
Will default to 10, but if set to None the chart will not be
produced.
bottom_n (int, optional): The number of bottom n values to plot.
Will default to 10, but if set to None the chart will not be produced.
kde_plots (bool, optional): A boolean value indicating whether kde plots should be
produced.
distribution_plots (bool, optional): A boolean value indicating whether
distribution plots should be produced.
Returns:
altair.Chart or dict: A visualization or JSON specification describing the
profiling charts.
profiling charts.
Note:
- The `linker` object should be an instance of the initiated linker.
- The provided `column_expressions` can be a list of column names to profile.
If left empty, all columns will be profiled.
If left empty, all columns will be profiled.
- The `top_n` and `bottom_n` parameters determine the number of top and bottom
values to display in the respective charts.
values to display in the respective charts.
"""

if not column_expressions:
column_expressions = linker._get_input_columns
if top_n is None and bottom_n is None and not distribution_plots and not kde_plots:
logger.warning(
"Warning: No charts in profile_columns have been selected."
)
return None


df_concat = linker._initialise_df_concat()

@@ -243,48 +308,83 @@ def profile_columns(linker, column_expressions=None, top_n=10, bottom_n=10):
sql = _col_or_expr_frequencies_raw_data_sql(
column_expressions_raw, "__splink__df_concat"
)

linker._enqueue_sql(sql, "__splink__df_all_column_value_frequencies")
df_raw = linker._execute_sql_pipeline(input_dataframes)

sqls = _get_df_percentiles()
for sql in sqls:
linker._enqueue_sql(sql["sql"], sql["output_table_name"])

df_percentiles = linker._execute_sql_pipeline([df_raw])
percentile_rows_all = df_percentiles.as_record_dict()

sql = _get_df_top_bottom_n(column_expressions, top_n, "desc")
linker._enqueue_sql(sql, "__splink__df_top_n")
df_top_n = linker._execute_sql_pipeline([df_raw])
top_n_rows_all = df_top_n.as_record_dict()

sql = _get_df_top_bottom_n(column_expressions, bottom_n, "asc")
linker._enqueue_sql(sql, "__splink__df_bottom_n")
df_bottom_n = linker._execute_sql_pipeline([df_raw])
bottom_n_rows_all = df_bottom_n.as_record_dict()
if distribution_plots:
sqls = _get_df_percentiles()
for sql in sqls:
linker._enqueue_sql(sql["sql"], sql["output_table_name"])
df_percentiles = linker._execute_sql_pipeline([df_raw])
percentile_rows_all = df_percentiles.as_record_dict()
else:
percentile_rows_all = None
percentile_rows = None

if top_n is not None:
sql = _get_df_top_bottom_n(column_expressions, top_n, "desc")
linker._enqueue_sql(sql, "__splink__df_top_n")
df_top_n = linker._execute_sql_pipeline([df_raw])
top_n_rows_all = df_top_n.as_record_dict()
else:
top_n_rows_all = None
top_n_rows = None

if kde_plots:
sql = _get_df_kde()
linker._enqueue_sql(sql, "__splink__df_kde")
df_kde = linker._execute_sql_pipeline([df_raw])
kde_rows_all = df_kde.as_record_dict()
else:
kde_rows_all = None
kde_rows = None

if bottom_n is not None:
sql = _get_df_top_bottom_n(column_expressions, bottom_n, "asc")
linker._enqueue_sql(sql, "__splink__df_bottom_n")
df_bottom_n = linker._execute_sql_pipeline([df_raw])
bottom_n_rows_all = df_bottom_n.as_record_dict()
else:
bottom_n_rows_all = None
bottom_n_rows = None

inner_charts = []

for expression in column_expressions:
percentile_rows = [
p for p in percentile_rows_all if p["group_name"] == _group_name(expression)
]
percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows)
top_n_rows = [
p for p in top_n_rows_all if p["group_name"] == _group_name(expression)
]
bottom_n_rows = [
p for p in bottom_n_rows_all if p["group_name"] == _group_name(expression)
]
# remove concat blank from expression title
expression = expression.replace(", ' '", "")

if distribution_plots:
percentile_rows = [
p
for p in percentile_rows_all
if p["group_name"] == _group_name(expression)
]
percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows)
if top_n is not None:
top_n_rows = [
p for p in top_n_rows_all if p["group_name"] == _group_name(expression)
]
if bottom_n is not None:
bottom_n_rows = [
p
for p in bottom_n_rows_all
if p["group_name"] == _group_name(expression)
]
if kde_plots:
kde_rows = [
p for p in kde_rows_all if p["group_name"] == _group_name(expression)
]

inner_chart = _get_inner_chart_spec_freq(
percentile_rows, top_n_rows, bottom_n_rows, expression
percentile_data=percentile_rows,
top_n_data=top_n_rows,
bottom_n_data=bottom_n_rows,
kde_data=kde_rows,
col_name=expression,
)

inner_charts.append(inner_chart)
outer_spec = deepcopy(_outer_chart_spec_freq)

outer_spec = deepcopy(_outer_chart_spec_freq)
outer_spec["vconcat"] = inner_charts

return altair_or_json(outer_spec)
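For reference, a minimal sketch of how the new gating behaves end to end, assuming an initialised linker and illustrative column names: with at least one chart type enabled the function returns the concatenated spec; with everything disabled it logs the warning above and returns None.

charts = profile_columns(
    linker,
    ["first_name", "surname"],
    top_n=10,
    bottom_n=5,
)  # altair.Chart or JSON spec

nothing = profile_columns(
    linker,
    ["first_name"],
    top_n=None,
    bottom_n=None,
    distribution_plots=False,
    kde_plots=False,
)  # logs "Warning: No charts in profile_columns have been selected." -> None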
52 changes: 52 additions & 0 deletions tests/test_profile_data.py
@@ -14,6 +14,7 @@
from splink.sqlite.linker import SQLiteLinker

from .basic_settings import get_settings_dict
from .decorator import mark_with_dialects_excluding


def generate_raw_profile_dataset(columns_to_profile, linker):
@@ -176,3 +177,54 @@ def test_profile_using_spark(df_spark):
)

assert len(generate_raw_profile_dataset([["first_name", "blank"]], linker)) == 0


@mark_with_dialects_excluding()
def test_profile_data(test_helpers, dialect, caplog):
helper = test_helpers[dialect]
settings = get_settings_dict()
Linker = helper.Linker

df = helper.load_frame_from_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
linker = Linker(df, settings, **helper.extra_linker_args())

# Test original syntax
linker.profile_columns(
["first_name", "city", "surname", "email", "substr(dob, 1,4)"],
top_n=10,
bottom_n=5,
)

# Test new chart specifically
linker.profile_numeric_columns(
["substr(dob, 1,4)"],
top_n=None,
bottom_n=None,
kde_plots=True,
distribution_plots=False,
)

# Test ability to show all elements
linker.profile_numeric_columns(
["first_name", "city", "surname", "email", "substr(dob, 1,4)"],
top_n=10,
bottom_n=5,
kde_plots=True,
distribution_plots=True,
)

# Test error message when user requests 0 elements
linker.profile_numeric_columns(
["first_name", "city", "surname", "email", "substr(dob, 1,4)"],
top_n=None,
bottom_n=None,
kde_plots=False,
distribution_plots=False,
)

captured_logs = caplog.text

assert (
"Warning: No charts in profile_columns have been selected."
in captured_logs
)