From f39f669d941b1c8bb5323c7f003bac3536c85d9b Mon Sep 17 00:00:00 2001
From: Stefan Siegel
Date: Thu, 7 Nov 2024 15:47:19 +0100
Subject: [PATCH] fix: Avoid failure when index level shares name with a column

Previously, report generation failed for DataFrames where an index level
had the same name as a column, resulting in a "ValueError: 'foo' is both
an index level and a column label, which is ambiguous." This update
removes index names for the relevant groupby operation, ensuring the
column is prioritized.
---
 .../model/pandas/duplicates_pandas.py      |  1 +
 tests/unit/test_index_column_name_clash.py | 20 +++++++++++++++++++
 2 files changed, 21 insertions(+)
 create mode 100644 tests/unit/test_index_column_name_clash.py

diff --git a/src/ydata_profiling/model/pandas/duplicates_pandas.py b/src/ydata_profiling/model/pandas/duplicates_pandas.py
index 3f6fa1ca7..2923a643f 100644
--- a/src/ydata_profiling/model/pandas/duplicates_pandas.py
+++ b/src/ydata_profiling/model/pandas/duplicates_pandas.py
@@ -35,6 +35,7 @@ def pandas_get_duplicates(
             duplicated_rows = df.duplicated(subset=supported_columns, keep=False)
             duplicated_rows = (
                 df[duplicated_rows]
+                .rename_axis(index=lambda _: None)
                 .groupby(supported_columns, dropna=False, observed=True)
                 .size()
                 .reset_index(name=duplicates_key)
diff --git a/tests/unit/test_index_column_name_clash.py b/tests/unit/test_index_column_name_clash.py
new file mode 100644
index 000000000..0b149a3f6
--- /dev/null
+++ b/tests/unit/test_index_column_name_clash.py
@@ -0,0 +1,20 @@
+import pandas as pd
+import pytest
+
+from ydata_profiling import ProfileReport
+
+
+@pytest.fixture()
+def df():
+    df = pd.DataFrame(
+        {
+            "foo": [1, 2, 3],
+        },
+        index=pd.Index([1, 2, 3], name="foo"),
+    )
+    return df
+
+
+def test_index_column_name_clash(df: pd.DataFrame):
+    profile_report = ProfileReport(df, title="Test Report", progress_bar=False)
+    assert len(profile_report.to_html()) > 0