From b57902875943c3e36d991f090110bc156dfbe71e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Jolivet?= Date: Tue, 21 Nov 2023 14:14:48 +0100 Subject: [PATCH] Create function to test if the polars module is available. Use it to xfail specific tests --- skrub/_dataframe/_namespace.py | 13 +++++++++++++ skrub/tests/test_datetime_encoder.py | 3 ++- skrub/tests/test_fuzzy_join.py | 27 +++++++++++++------------- skrub/tests/test_gap_encoder.py | 3 ++- skrub/tests/test_interpolation_join.py | 23 +++++++++++----------- skrub/tests/test_joiner.py | 5 +++-- skrub/tests/test_similarity_encoder.py | 3 ++- 7 files changed, 48 insertions(+), 29 deletions(-) diff --git a/skrub/_dataframe/_namespace.py b/skrub/_dataframe/_namespace.py index 06c65a2ea..d8773cdda 100644 --- a/skrub/_dataframe/_namespace.py +++ b/skrub/_dataframe/_namespace.py @@ -43,6 +43,19 @@ def is_polars(dataframe): return isinstance(dataframe, (pl.DataFrame, pl.LazyFrame)) +def is_namespace_pandas(px): + return px is pd + + +def is_namespace_polars(px): + if "polars" not in sys.modules: + return False + + import polars as pl + + return px is pl + + def get_df_namespace(*dfs): """Get the namespaces of dataframes. diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index 1fdba2bc3..9838995f9 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -7,6 +7,7 @@ from numpy.testing import assert_allclose, assert_array_equal from pandas.testing import assert_frame_equal +from skrub._dataframe._namespace import is_namespace_polars from skrub._dataframe._polars import POLARS_SETUP from skrub._datetime_encoder import ( TIME_LEVELS, @@ -352,7 +353,7 @@ def test_transform_nan(px): @pytest.mark.parametrize("px", MODULES) def test_mixed_type_dataframe(px): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail( reason=( "to_datetime(X) raises polars.exceptions.ComputeError: cannot cast" diff --git a/skrub/tests/test_fuzzy_join.py b/skrub/tests/test_fuzzy_join.py index 875ebfc13..dc57a440a 100644 --- a/skrub/tests/test_fuzzy_join.py +++ b/skrub/tests/test_fuzzy_join.py @@ -9,6 +9,7 @@ from sklearn.feature_extraction.text import HashingVectorizer from skrub import fuzzy_join +from skrub._dataframe._namespace import is_namespace_polars from skrub._dataframe._polars import POLARS_SETUP MODULES = [pd] @@ -31,7 +32,7 @@ def test_fuzzy_join(px, analyzer: Literal["char", "char_wb", "word"]): """ Testing if fuzzy_join results are as expected. """ - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Polars DataFrame object has no attribute 'reset_index'") df1 = px.DataFrame({"a1": ["ana", "lala", "nana et sana", np.NaN]}) df2 = px.DataFrame({"a2": ["anna", "lala et nana", "lana", "sana", np.NaN]}) @@ -97,7 +98,7 @@ def test_fuzzy_join(px, analyzer: Literal["char", "char_wb", "word"]): @pytest.mark.parametrize("px", MODULES) def test_match_score(px): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Polars DataFrame object has no attribute 'reset_index'") left = px.DataFrame({"A": ["aa", "bb"]}) right = px.DataFrame({"A": ["aa", "ba"], "B": [1, 2]}) @@ -109,7 +110,7 @@ def test_match_score(px): @pytest.mark.parametrize("px", MODULES) def test_perfect_matches(px): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Polars DataFrame object has no attribute 'reset_index'") # non-regression test for https://github.com/skrub-data/skrub/issues/764 # fuzzy_join when all rows had a perfect match used to trigger a division by 0 @@ -126,7 +127,7 @@ def test_fuzzy_join_dtypes(px): """ Test that the dtypes of dataframes are maintained after join """ - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Polars DataFrame object has no attribute 'reset_index'") a = px.DataFrame({"col1": ["aaa", "bbb"], "col2": [1, 2]}) b = px.DataFrame({"col1": ["aaa_", "bbb_"], "col3": [1, 2]}) @@ -171,7 +172,7 @@ def test_parameters_error(px, analyzer, on, how) -> None: @pytest.mark.parametrize("px", MODULES) def test_missing_keys(px): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Polars DataFrame object has no attribute 'reset_index'") a = px.DataFrame({"col1": ["aaa", "bbb"], "col2": [1, 2]}) b = px.DataFrame({"col1": ["aaa_", "bbb_"], "col3": [1, 2]}) @@ -190,7 +191,7 @@ def test_missing_keys(px): @pytest.mark.parametrize("px", MODULES) def test_drop_unmatched(px): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Polars DataFrame object has no attribute 'reset_index'") a = px.DataFrame({"col1": ["aaaa", "bbb", "ddd dd"], "col2": [1, 2, 3]}) b = px.DataFrame({"col1": ["aaa_", "bbb_", "cc ccc"], "col3": [1, 2, 3]}) @@ -214,7 +215,7 @@ def test_how_param(px): Test correct shape of left and right joins. Also test if an error is raised when an incorrect parameter value is passed. """ - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Polars DataFrame object has no attribute 'reset_index'") a = px.DataFrame({"col1": ["aaaa", "bbb", "ddd dd"], "col2": [1, 2, 3]}) b = px.DataFrame( @@ -290,7 +291,7 @@ def test_correct_encoder(px): """ Test that the encoder error checking is working as intended. """ - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Polars DataFrame object has no attribute 'reset_index'") class TestVectorizer(HashingVectorizer): @@ -337,7 +338,7 @@ def test_numerical_column(px): """ Testing that fuzzy_join works with numerical columns. """ - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Polars DataFrame object has no attribute 'reset_index'") left = px.DataFrame({"str1": ["aa", "a", "bb"], "int": [10, 2, 5]}) right = px.DataFrame( @@ -371,7 +372,7 @@ def test_datetime_column(px, assert_frame_equal_): """ Testing that fuzzy_join works with datetime columns. """ - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Module 'polars' has no attribute 'to_datetime'") left = px.DataFrame( { @@ -423,7 +424,7 @@ def test_mixed_joins(px, assert_frame_equal_): """ Test fuzzy joining on mixed and multiple column types. """ - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Module 'polars' has no attribute 'to_datetime'") left = px.DataFrame( { @@ -569,7 +570,7 @@ def test_iterable_input(px): """ Test if iterable input: list, set, dictionary or tuple works. """ - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Polars DataFrame object has no attribute 'reset_index'") df1 = px.DataFrame( {"a": ["ana", "lala", "nana"], "str2": ["Texas", "France", "Greek God"]} @@ -594,7 +595,7 @@ def test_missing_values(px): """ Test fuzzy joining on missing values. """ - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Polars DataFrame object has no attribute 'reset_index'") a = px.DataFrame({"col1": ["aaaa", "bbb", "ddd dd"], "col2": [1, 2, 3]}) b = px.DataFrame({"col3": [np.NaN, "bbb", "ddd dd"], "col4": [1, 2, 3]}) diff --git a/skrub/tests/test_gap_encoder.py b/skrub/tests/test_gap_encoder.py index 6f6afed33..fa2540a26 100644 --- a/skrub/tests/test_gap_encoder.py +++ b/skrub/tests/test_gap_encoder.py @@ -5,6 +5,7 @@ from sklearn.model_selection import train_test_split from skrub import GapEncoder +from skrub._dataframe._namespace import is_namespace_polars from skrub._dataframe._polars import POLARS_SETUP from skrub.datasets import fetch_midwest_survey from skrub.tests.utils import generate_data @@ -222,7 +223,7 @@ def test_score(n_samples: int = 70): ) def test_missing_values(px, missing: str): """Test what happens when missing values are in the data""" - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail( reason=( "'TypeError: '<' not supported between instances of 'DataTypeClass' and" diff --git a/skrub/tests/test_interpolation_join.py b/skrub/tests/test_interpolation_join.py index d29a55e28..c3c4cbd5a 100644 --- a/skrub/tests/test_interpolation_join.py +++ b/skrub/tests/test_interpolation_join.py @@ -6,6 +6,7 @@ from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor from skrub import InterpolationJoiner +from skrub._dataframe._namespace import is_namespace_polars from skrub._dataframe._polars import POLARS_SETUP MODULES = [pd] @@ -39,7 +40,7 @@ def weather(): @pytest.mark.parametrize("key", [["latitude", "longitude"], "latitude"]) @pytest.mark.parametrize("with_nulls", [False, True]) def test_interpolation_join(px, buildings, weather, key, with_nulls): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail( reason=( "In polars, DataFrame.drop() got an unexpected keyword argument 'axis'" @@ -60,7 +61,7 @@ def test_interpolation_join(px, buildings, weather, key, with_nulls): @pytest.mark.parametrize("px", MODULES) def test_vectorizer(px): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail( reason=( "In polars, DataFrame.drop() got an unexpected keyword argument 'axis'" @@ -87,7 +88,7 @@ def transform(self, X): @pytest.mark.parametrize("px", MODULES) def test_no_multioutput(px, buildings, weather): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail( reason=( "In polars, DataFrame.drop() got an unexpected keyword argument 'axis'" @@ -105,7 +106,7 @@ def test_no_multioutput(px, buildings, weather): @pytest.mark.parametrize("px", MODULES) def test_condition_choice(px): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail( reason=( "In polars, DataFrame.drop() got an unexpected keyword argument 'axis'" @@ -143,7 +144,7 @@ def test_condition_choice(px): @pytest.mark.parametrize("px", MODULES) def test_suffix(px): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail( reason=( "In polars, DataFrame.drop() got an unexpected keyword argument 'axis'" @@ -159,7 +160,7 @@ def test_suffix(px): @pytest.mark.parametrize("px", MODULES) def test_mismatched_indexes(px): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail( reason=( "In polars, DataFrame.drop() got an unexpected keyword argument 'axis'" @@ -179,7 +180,7 @@ def test_mismatched_indexes(px): @pytest.mark.parametrize("px", MODULES) def test_fit_on_none(px): # X is hardly used in fit so it should be ok to fit without a main table - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail( reason=( "In polars, DataFrame.drop() got an unexpected keyword argument 'axis'" @@ -199,7 +200,7 @@ def test_fit_on_none(px): @pytest.mark.parametrize("px", MODULES) def test_join_on_date(px): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail( reason=( "In polars, DataFrame.drop() got an unexpected keyword argument 'axis'" @@ -231,7 +232,7 @@ def fit(self, X, y): @pytest.mark.parametrize("px", MODULES) def test_fit_failures(px, buildings, weather): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail( reason=( "In polars, DataFrame.drop() got an unexpected keyword argument 'axis'" @@ -281,7 +282,7 @@ def predict(self, X): @pytest.mark.parametrize("px", MODULES) def test_transform_failures(px, buildings, weather): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail( reason=( "In polars, DataFrame.drop() got an unexpected keyword argument 'axis'" @@ -329,7 +330,7 @@ def test_transform_failures(px, buildings, weather): @pytest.mark.parametrize("px", MODULES) def test_transform_failures_dtype(px, buildings, weather): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail( reason=( "In polars, DataFrame.drop() got an unexpected keyword argument 'axis'" diff --git a/skrub/tests/test_joiner.py b/skrub/tests/test_joiner.py index 0ce6082c8..f73817764 100644 --- a/skrub/tests/test_joiner.py +++ b/skrub/tests/test_joiner.py @@ -4,6 +4,7 @@ from pandas.testing import assert_frame_equal from skrub import Joiner +from skrub._dataframe._namespace import is_namespace_polars from skrub._dataframe._polars import POLARS_SETUP MODULES = [pd] @@ -19,7 +20,7 @@ @pytest.mark.parametrize("px", MODULES) def test_joiner(px): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Polars DataFrame object has no attribute 'reset_index'") main_table = px.DataFrame( { @@ -65,7 +66,7 @@ def test_joiner(px): @pytest.mark.parametrize("px, assert_frame_equal_", ASSERT_TUPLES) def test_multiple_keys(px, assert_frame_equal_): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Polars DataFrame object has no attribute 'reset_index'") df = px.DataFrame( {"Co": ["France", "Italia", "Deutchland"], "Ca": ["Paris", "Roma", "Berlin"]} diff --git a/skrub/tests/test_similarity_encoder.py b/skrub/tests/test_similarity_encoder.py index a1ff9478c..97a729585 100644 --- a/skrub/tests/test_similarity_encoder.py +++ b/skrub/tests/test_similarity_encoder.py @@ -7,6 +7,7 @@ from sklearn.exceptions import NotFittedError from skrub import SimilarityEncoder +from skrub._dataframe._namespace import is_namespace_polars from skrub._dataframe._polars import POLARS_SETUP from skrub._similarity_encoder import ngram_similarity_matrix from skrub._string_distances import ngram_similarity @@ -341,7 +342,7 @@ def test_check_fitted_super_vectorizer(): @pytest.mark.parametrize("px", MODULES) def test_inverse_transform(px): - if px.__name__ == "polars": + if is_namespace_polars(px): pytest.xfail(reason="Setting output to polars is not possible yet.") encoder = SimilarityEncoder() encoder.set_output(transform="pandas")