Skip to content

Commit

Permalink
Support Pandas future.infer_string=True in report generation
Browse files Browse the repository at this point in the history
Previously, report generation encountered issues when
`future.infer_string=True` was set. This resulted in multiple warnings
("FutureWarning: Dtype inference on a pandas object is deprecated") and
failures when string columns contained only empty strings
("AttributeError: 'StringDtype' object has no attribute
'pyarrow_dtype'").
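This change resolves the issue by explicitly setting the dtype to
"object" for the relevant operations. It also runs the
`series.fillna(np.nan)` call in `pandas_describe_1d` under
`pd.option_context("future.no_silent_downcasting", True)`.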
  • Loading branch information
ssiegel committed Nov 7, 2024
1 parent 3b91f87 commit 0364724
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@


def get_character_counts_vc(vc: pd.Series) -> pd.Series:
series = pd.Series(vc.index, index=vc)
series = pd.Series(vc.index, index=vc, dtype=object)
characters = series[series != ""].apply(list)
characters = characters.explode()

Expand Down Expand Up @@ -169,7 +169,7 @@ def word_summary_vc(vc: pd.Series, stop_words: List[str] = []) -> dict:
# TODO: configurable lowercase/punctuation etc.
# TODO: remove punctuation in words

series = pd.Series(vc.index, index=vc)
series = pd.Series(vc.index, index=vc, dtype=object)
word_lists = series.str.lower().str.split()
words = word_lists.explode().str.strip(string.punctuation + string.whitespace)
word_counts = pd.Series(words.index, index=words)
Expand All @@ -187,7 +187,7 @@ def word_summary_vc(vc: pd.Series, stop_words: List[str] = []) -> dict:


def length_summary_vc(vc: pd.Series) -> dict:
series = pd.Series(vc.index, index=vc)
series = pd.Series(vc.index, index=vc, dtype=object)
length = series.str.len()
length_counts = pd.Series(length.index, index=length)
length_counts = length_counts.groupby(level=0, sort=False).sum()
Expand Down
3 changes: 2 additions & 1 deletion src/ydata_profiling/model/pandas/summary_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ def pandas_describe_1d(
"""

# Make sure pd.NA is not in the series
series = series.fillna(np.nan)
with pd.option_context("future.no_silent_downcasting", True):
series = series.fillna(np.nan)

if (
isinstance(typeset, ProfilingTypeSet)
Expand Down
21 changes: 21 additions & 0 deletions tests/unit/test_pd_future_infer_string.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import pandas as pd
import pytest

from ydata_profiling import ProfileReport


@pytest.fixture()
def df():
    """Build a three-row frame for the ``future.infer_string`` regression test.

    ``foo`` holds integers; ``bar`` holds nothing but empty strings, the
    case the commit message reports as failing during report generation.
    """
    columns = {
        "foo": [1, 2, 3],
        "bar": ["", "", ""],
    }
    return pd.DataFrame(columns)


def test_pd_future_infer_string(df: pd.DataFrame):
    """Render an HTML report while ``future.infer_string`` is enabled.

    The test passes when rendering completes and yields non-empty output.
    """
    with pd.option_context("future.infer_string", True):
        report = ProfileReport(df, title="Test Report", progress_bar=False)
        # Keep rendering inside the context so the option is active while
        # the HTML is produced.
        html = report.to_html()
        assert len(html) > 0

0 comments on commit 0364724

Please sign in to comment.