Fixing CI (#929)
Co-authored-by: James Bourbeau <[email protected]>
mmccarty and jrbourbeau authored May 27, 2022
1 parent a5b96ed commit 7b8c8e0
Showing 20 changed files with 74 additions and 30 deletions.

ci/environment-3.7.yaml (2 changes: 1 addition & 1 deletion)
@@ -20,4 +20,4 @@ dependencies:
- scikit-learn>=1.0.0
- scipy
- sparse
- toolz
- toolz

ci/environment-3.8.yaml (2 changes: 1 addition & 1 deletion)
@@ -20,4 +20,4 @@ dependencies:
- scikit-learn>=1.0.0
- scipy
- sparse
- toolz
- toolz

ci/environment-3.9.yaml (2 changes: 1 addition & 1 deletion)
@@ -20,4 +20,4 @@ dependencies:
- scikit-learn>=1.0.0
- scipy
- sparse
- toolz
- toolz

ci/environment-docs.yaml (1 change: 0 additions & 1 deletion)
@@ -29,7 +29,6 @@ dependencies:
- sphinx-gallery
- tornado
- toolz
- xgboost
- zict
- pip
- dask

dask_ml/_compat.py (8 changes: 8 additions & 0 deletions)
@@ -29,6 +29,14 @@
PANDAS_1_2_0 = PANDAS_VERSION > packaging.version.parse("1.2.0")
WINDOWS = os.name == "nt"

SKLEARN_1_1_X = SK_VERSION >= packaging.version.parse("1.1")

# 'log_loss' is preferred as of scikit-learn 1.1
if SKLEARN_1_1_X:
SK_LOG_LOSS = "log_loss"
else:
SK_LOG_LOSS = "log"


@contextlib.contextmanager
def dummy_context(*args: Any, **kwargs: Any):
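
As a hypothetical usage sketch (not part of this diff), the SK_LOG_LOSS alias lets downstream code pass whichever loss name the installed scikit-learn accepts, since "log" was renamed to "log_loss" as of scikit-learn 1.1:

# Hypothetical usage of the compat alias added above; assumes scikit-learn
# and dask-ml are installed and that dask_ml._compat exposes SK_LOG_LOSS.
from sklearn.linear_model import SGDClassifier

from dask_ml._compat import SK_LOG_LOSS

# Resolves to "log_loss" on scikit-learn >= 1.1 and to "log" on older releases.
model = SGDClassifier(loss=SK_LOG_LOSS, tol=1e-3)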

dask_ml/cluster/k_means.py (1 change: 0 additions & 1 deletion)
@@ -541,7 +541,6 @@ def _kmeans_single_lloyd(
labels, distances = pairwise_distances_argmin_min(
X, centers, metric="euclidean", metric_kwargs={"squared": True}
)

labels = labels.astype(np.int32)
# distances is always float64, but we need it to match X.dtype
# for centers_dense, but remain float64 for inertia

dask_ml/metrics/regression.py (5 changes: 4 additions & 1 deletion)
@@ -3,6 +3,7 @@
import dask.array as da
import numpy as np
import sklearn.metrics
from dask import is_dask_collection
from dask.utils import derived_from

from .._typing import ArrayLike
@@ -16,7 +17,9 @@ def _check_sample_weight(sample_weight: Optional[ArrayLike]):
def _check_reg_targets(
y_true: ArrayLike, y_pred: ArrayLike, multioutput: Optional[str]
):
if multioutput is not None and multioutput != "uniform_average":
if multioutput is not None and (
is_dask_collection(multioutput) or multioutput != "uniform_average"
):
raise NotImplementedError("'multioutput' must be 'uniform_average'")

if y_true.ndim == 1:
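
The added is_dask_collection check rejects a dask object passed as multioutput up front, instead of letting the comparison with "uniform_average" build a lazy dask expression whose truth value is ambiguous. A rough usage sketch (hypothetical values, assuming dask_ml.metrics.mean_squared_error):

# Rough sketch, assuming dask_ml.metrics.mean_squared_error; only the default
# multioutput="uniform_average" is supported, and passing a dask collection
# (or any other value) raises NotImplementedError.
import dask.array as da
import numpy as np
from dask_ml.metrics import mean_squared_error

y_true = da.from_array(np.array([1.0, 2.0, 3.0]), chunks=3)
y_pred = da.from_array(np.array([1.1, 1.9, 3.2]), chunks=3)

mean_squared_error(y_true, y_pred)  # fine: uses the default multioutput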

dask_ml/model_selection/_incremental.py (9 changes: 5 additions & 4 deletions)
@@ -23,14 +23,15 @@
from sklearn.metrics import check_scoring
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.utils import check_random_state
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.utils.metaestimators import available_if
from sklearn.utils.validation import check_is_fitted

from .._compat import DISTRIBUTED_2021_02_0, annotate, dummy_context
from .._typing import ArrayLike, Int
from .._utils import LoggingContext
from ..wrappers import ParallelPostFit
from ._split import train_test_split
from .utils import estimator_has

logger = logging.getLogger("dask_ml.model_selection")

@@ -726,17 +727,17 @@ def fit(self, X, y=None, **fit_params):
return client.sync(self._fit, X, y, **fit_params)
return self._fit(X, y, **fit_params)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("decision_function"))
def decision_function(self, X):
self._check_is_fitted("decision_function")
return self.best_estimator_.decision_function(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("transform"))
def transform(self, X):
self._check_is_fitted("transform")
return self.best_estimator_.transform(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("inverse_transform"))
def inverse_transform(self, Xt):
self._check_is_fitted("inverse_transform")
return self.best_estimator_.transform(Xt)
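
Newer scikit-learn releases deprecate if_delegate_has_method in favour of available_if, which takes a callable predicate rather than delegate attribute names. A minimal sketch of the pattern used above, with a hypothetical wrapper class:

# Minimal sketch of the available_if pattern (hypothetical Wrapper class);
# the decorated method only appears on instances whose wrapped estimator
# actually provides it, so hasattr() gives the expected answer.
from sklearn.utils.metaestimators import available_if


def _wrapped_has(attr):
    def check(self):
        return hasattr(self.estimator, attr)

    return check


class Wrapper:
    def __init__(self, estimator):
        self.estimator = estimator

    @available_if(_wrapped_has("predict_proba"))
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)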

dask_ml/model_selection/_search.py (23 changes: 15 additions & 8 deletions)
@@ -32,7 +32,7 @@
)
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.utils._tags import _safe_tags
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.utils.metaestimators import available_if
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import _num_samples, check_is_fitted

@@ -54,7 +54,14 @@
pipeline,
score,
)
from .utils import DeprecationDict, is_dask_collection, to_indexable, to_keys, unzip
from .utils import (
DeprecationDict,
estimator_has,
is_dask_collection,
to_indexable,
to_keys,
unzip,
)

logger = logging.getLogger(__name__)

@@ -1126,37 +1133,37 @@ def classes_(self):
self._check_is_fitted("classes_")
return self.best_estimator_.classes_

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("predict"))
@derived_from(BaseSearchCV)
def predict(self, X):
self._check_is_fitted("predict")
return self.best_estimator_.predict(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("predict_proba"))
@derived_from(BaseSearchCV)
def predict_proba(self, X):
self._check_is_fitted("predict_proba")
return self.best_estimator_.predict_proba(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("predict_log_proba"))
@derived_from(BaseSearchCV)
def predict_log_proba(self, X):
self._check_is_fitted("predict_log_proba")
return self.best_estimator_.predict_log_proba(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("decision_function"))
@derived_from(BaseSearchCV)
def decision_function(self, X):
self._check_is_fitted("decision_function")
return self.best_estimator_.decision_function(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("transform"))
@derived_from(BaseSearchCV)
def transform(self, X):
self._check_is_fitted("transform")
return self.best_estimator_.transform(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("inverse_transform"))
@derived_from(BaseSearchCV)
def inverse_transform(self, Xt):
self._check_is_fitted("inverse_transform")

dask_ml/model_selection/utils.py (10 changes: 10 additions & 0 deletions)
@@ -97,6 +97,16 @@ def copy_estimator(est):
return copy.deepcopy(est)


def estimator_has(attr):
def _check(self):
if hasattr(self, "best_estimator_"):
return hasattr(self.best_estimator_, attr)
if hasattr(self, "estimator"):
return hasattr(self.estimator, attr)

return _check


def unzip(itbl, n):
return zip(*itbl) if itbl else [()] * n

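
The predicate checks best_estimator_ first (set once fit has run) and falls back to the unfitted estimator, so method availability on the search object tracks the underlying model both before and after fitting. A rough illustration, assuming the available_if-decorated methods in _search.py above:

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from dask_ml.model_selection import GridSearchCV

# LogisticRegression provides predict_proba, so the search object exposes it...
print(hasattr(GridSearchCV(LogisticRegression(), {"C": [0.1, 1.0]}), "predict_proba"))  # True

# ...while LinearSVC does not, so the method is hidden on the wrapper.
print(hasattr(GridSearchCV(LinearSVC(), {"C": [0.1, 1.0]}), "predict_proba"))  # False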

dask_ml/preprocessing/_encoders.py (14 changes: 13 additions & 1 deletion)
@@ -6,6 +6,7 @@
import pandas as pd
import sklearn.preprocessing

from .._compat import SKLEARN_1_1_X
from .._typing import ArrayLike, DataFrameType, DTypeLike, SeriesType
from ..base import DaskMLBaseMixin
from ..utils import check_array
@@ -160,6 +161,7 @@ def _fit(
X: Union[ArrayLike, DataFrameType],
handle_unknown: str = "error",
force_all_finite: bool = True,
return_counts=False,
):
X = self._validate_data(
X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True
@@ -168,8 +170,18 @@
self._check_feature_names(X, reset=True)

if isinstance(X, np.ndarray):
kwargs = {
"handle_unknown": handle_unknown,
"force_all_finite": force_all_finite,
}

# `return_counts` expected as of scikit-learn 1.1
if SKLEARN_1_1_X:
kwargs["return_counts"] = return_counts

return super(OneHotEncoder, self)._fit(
X, handle_unknown=handle_unknown, force_all_finite=force_all_finite
X,
**kwargs,
)

is_array = isinstance(X, da.Array)
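
scikit-learn 1.1 adds a return_counts argument to the base encoder's _fit (used for its infrequent-category support), so the keyword is only forwarded when the installed version understands it. A generic sketch of this version-gating pattern, with hypothetical helper names:

# Generic sketch of the version-gating pattern above (hypothetical helper);
# keywords that only newer scikit-learn releases accept are added
# conditionally so older releases never receive an unexpected argument.
import packaging.version
import sklearn

SKLEARN_1_1_X = packaging.version.parse(sklearn.__version__) >= packaging.version.parse("1.1")


def build_fit_kwargs(handle_unknown="error", force_all_finite=True, return_counts=False):
    kwargs = {"handle_unknown": handle_unknown, "force_all_finite": force_all_finite}
    if SKLEARN_1_1_X:
        kwargs["return_counts"] = return_counts
    return kwargs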

docs/source/conf.py (6 changes: 3 additions & 3 deletions)
@@ -46,11 +46,11 @@
]

intersphinx_mapping = {
"python": ("https://docs.python.org/3.6", None),
"sklearn": ("http://scikit-learn.org/stable/", None),
"python": ("https://docs.python.org", None),
"sklearn": ("https://scikit-learn.org/stable/", None),
"dask": ("https://docs.dask.org/en/latest/", None),
"distributed": ("https://distributed.dask.org/en/latest/", None),
"dask_glm": ("http://dask-glm.readthedocs.io/en/latest/", None),
"dask_glm": ("https://dask-glm.readthedocs.io/en/latest/", None),
}

numpydoc_class_members_toctree = False

docs/source/hyper-parameter-search.rst (1 change: 1 addition & 0 deletions)
@@ -420,6 +420,7 @@ This section uses :class:`~dask_ml.model_selection.HyperbandSearchCV`, but it can
also be applied to to :class:`~dask_ml.model_selection.IncrementalSearchCV` too.

.. ipython:: python
:okwarning:
from dask.distributed import Client
from dask_ml.datasets import make_classification

docs/source/incremental.rst (1 change: 1 addition & 0 deletions)
@@ -48,6 +48,7 @@ between machines.


.. ipython:: python
:okwarning:
from dask_ml.datasets import make_classification
from dask_ml.wrappers import Incremental

docs/source/meta-estimators.rst (2 changes: 2 additions & 0 deletions)
@@ -56,6 +56,7 @@ This class is useful for predicting for or transforming large datasets.
We'll make a larger dask array ``X_big`` with 10,000 samples per block.

.. ipython:: python
:okwarning:
X_big, _ = dask_ml.datasets.make_classification(n_samples=100000,
chunks=10000,
@@ -68,6 +69,7 @@ cause the scheduler to compute tasks in parallel. If you've connected to a
cluster of machines.

.. ipython:: python
:okexcept:
clf.predict_proba(X_big).compute()[:10]

docs/source/naive-bayes.rst (1 change: 1 addition & 0 deletions)
@@ -19,6 +19,7 @@ Example
-------

.. ipython:: python
:okwarning:
from dask_ml import datasets
from dask_ml.naive_bayes import GaussianNB

tests/model_selection/test_hyperband.py (4 changes: 2 additions & 2 deletions)
@@ -15,7 +15,7 @@
)
from sklearn.linear_model import SGDClassifier

from dask_ml._compat import DISTRIBUTED_2_5_0
from dask_ml._compat import DISTRIBUTED_2_5_0, SK_LOG_LOSS
from dask_ml.datasets import make_classification
from dask_ml.model_selection import (
HyperbandSearchCV,
@@ -53,7 +53,7 @@ async def test_basic(c, s, a, b, array_type, library, max_iter):
X, y = await c.gather(futures)

params = {
"loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
"loss": ["hinge", SK_LOG_LOSS, "modified_huber", "squared_hinge", "perceptron"],
"average": [True, False],
"learning_rate": ["constant", "invscaling", "optimal"],
"eta0": np.logspace(-2, 0, num=1000),

tests/model_selection/test_incremental.py (6 changes: 3 additions & 3 deletions)
@@ -27,7 +27,7 @@
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.utils import check_random_state

from dask_ml._compat import DISTRIBUTED_2_5_0
from dask_ml._compat import DISTRIBUTED_2_5_0, SK_LOG_LOSS
from dask_ml.datasets import make_classification
from dask_ml.model_selection import (
HyperbandSearchCV,
@@ -249,7 +249,7 @@ async def _test_search_basic(decay_rate, input_type, memory, c, s, a, b):
X, y = pd.DataFrame(X), pd.DataFrame(y)
assert isinstance(X, pd.DataFrame)

model = SGDClassifier(tol=1e-3, loss="log", penalty="elasticnet")
model = SGDClassifier(tol=1e-3, loss=SK_LOG_LOSS, penalty="elasticnet")

params = {"alpha": np.logspace(-2, 2, 100), "l1_ratio": np.linspace(0.01, 1, 200)}

@@ -588,7 +588,7 @@ async def test_model_random_determinism(c, s, a, b):
n_samples=n, n_features=d, chunks=n // 10, random_state=0
)
params = {
"loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
"loss": ["hinge", SK_LOG_LOSS, "modified_huber", "squared_hinge", "perceptron"],
"average": [True, False],
"learning_rate": ["constant", "invscaling", "optimal"],
"eta0": np.logspace(-2, 0, num=1000),

tests/preprocessing/test_encoders.py (2 changes: 2 additions & 0 deletions)
@@ -43,6 +43,7 @@ def test_basic_array(sparse, method, categories):
"active_features_",
"dtypes_",
"drop_idx_",
"infrequent_categories_",
},
)

@@ -88,6 +89,7 @@ def test_basic_dataframe(sparse, method, dask_data, dtype):
"active_features_",
"dtypes_",
"drop_idx_",
"infrequent_categories_",
},
)


tests/test_kmeans.py (4 changes: 1 addition & 3 deletions)
@@ -179,9 +179,7 @@ def test_dtypes(self):


def test_dataframes():
df = dd.from_pandas(
pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 10]}), npartitions=2
)
df = dd.from_pandas(pd.DataFrame(np.random.uniform(size=(100, 4))), npartitions=2)

kmeans = DKKMeans()
kmeans.fit(df)