moj-analytical-services · sama-ds · Nov 8, 2023 · Jun 16, 2023 · Jun 16, 2023 · Jun 16, 2023
diff --git a/splink/analyse_blocking.py b/splink/analyse_blocking.py
@@ -6,8 +6,6 @@
 from .blocking import _sql_gen_where_condition, block_using_rules_sql
 from .misc import calculate_cartesian, calculate_reduction_ratio
 
-import pandas as pd
-
 # https://stackoverflow.com/questions/39740632/python-type-hinting-without-cyclic-imports
 if TYPE_CHECKING:
     from .linker import Linker
@@ -40,7 +38,6 @@ def cumulative_comparisons_generated_by_blocking_rules(
     linker: Linker,
     blocking_rules,
     output_chart=True,
-    return_dataframe=False
 ):
     # Deepcopy our original linker so we can safely adjust our settings.
     # This is particularly important to ensure we don't overwrite our
@@ -141,7 +138,4 @@ def cumulative_comparisons_generated_by_blocking_rules(
 
     linker._analyse_blocking_mode = False
 
-    if return_dataframe: 
-        return pd.DataFrame(br_comparisons)
-    else :
-        return br_comparisons
+    return br_comparisons
diff --git a/splink/exceptions.py b/splink/exceptions.py
@@ -1,5 +1,6 @@
 import warnings
 
+
 # base class for any type of custom exception
 class SplinkException(Exception):
     pass

diff --git a/splink/profile_data.py b/splink/profile_data.py
@@ -1,9 +1,12 @@
+import logging
 import re
 from copy import deepcopy
 
 from .charts import load_chart_definition, vegalite_or_json
 from .misc import ensure_is_list
 
+logger = logging.getLogger(__name__)
+
 
 def _group_name(cols_or_expr):
     cols_or_expr = re.sub(r"[^0-9a-zA-Z_]", " ", cols_or_expr)
@@ -230,19 +233,28 @@ def profile_columns(linker, column_expressions, top_n=10, bottom_n=10):
         percentile_rows = [
             p for p in percentile_rows_all if p["group_name"] == _group_name(expression)
         ]
-        percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows)
-        top_n_rows = [
-            p for p in top_n_rows_all if p["group_name"] == _group_name(expression)
-        ]
-        bottom_n_rows = [
-            p for p in bottom_n_rows_all if p["group_name"] == _group_name(expression)
-        ]
-        # remove concat blank from expression title
-        expression = expression.replace(", ' '", "")
-        inner_chart = _get_inner_chart_spec_freq(
-            percentile_rows, top_n_rows, bottom_n_rows, expression
-        )
-        inner_charts.append(inner_chart)
+        if percentile_rows == []:
+            logger.warning(
+                "Warning: No charts produced for "
+                f"{expression}"
+                " as the column only contains null values."
+            )
+        else:
+            percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows)
+            top_n_rows = [
+                p for p in top_n_rows_all if p["group_name"] == _group_name(expression)
+            ]
+            bottom_n_rows = [
+                p
+                for p in bottom_n_rows_all
+                if p["group_name"] == _group_name(expression)
+            ]
+            # remove concat blank from expression title
+            expression = expression.replace(", ' '", "")
+            inner_chart = _get_inner_chart_spec_freq(
+                percentile_rows, top_n_rows, bottom_n_rows, expression
+            )
+            inner_charts.append(inner_chart)
     outer_spec = deepcopy(_outer_chart_spec_freq)
 
     outer_spec["vconcat"] = inner_charts

diff --git a/tests/test_profile_data.py b/tests/test_profile_data.py
@@ -176,3 +176,23 @@ def test_profile_using_spark(df_spark):
     )
 
     assert len(generate_raw_profile_dataset([["first_name", "blank"]], linker)) == 0
+
+
+def test_profile_null_columns(caplog):
+
+    df = pd.DataFrame(
+        [
+            {"unique_id": 1, "test_1": 1, "test_2": None},
+        ]
+    )
+
+    linker = DuckDBLinker(df)
+
+    linker.profile_columns(["test_1", "test_2"])
+
+    captured_logs = caplog.text
+
+    assert (
+        "Warning: No charts produced for test_2 as the column only contains null values."
+        in captured_logs
+    )