Fixing CI (#929)
Co-authored-by: James Bourbeau <[email protected]>
mmccarty and jrbourbeau authored May 27, 2022
1 parent a5b96ed commit 7b8c8e0
Showing 20 changed files with 74 additions and 30 deletions.

ci/environment-3.7.yaml (2 changes: 1 addition & 1 deletion)
@@ -20,4 +20,4 @@ dependencies:
- scikit-learn>=1.0.0
- scipy
- sparse
- toolz
- toolz

ci/environment-3.8.yaml (2 changes: 1 addition & 1 deletion)
@@ -20,4 +20,4 @@ dependencies:
- scikit-learn>=1.0.0
- scipy
- sparse
- toolz
- toolz

ci/environment-3.9.yaml (2 changes: 1 addition & 1 deletion)
@@ -20,4 +20,4 @@ dependencies:
- scikit-learn>=1.0.0
- scipy
- sparse
- toolz
- toolz

ci/environment-docs.yaml (1 change: 0 additions & 1 deletion)
@@ -29,7 +29,6 @@ dependencies:
- sphinx-gallery
- tornado
- toolz
- xgboost
- zict
- pip
- dask

dask_ml/_compat.py (8 changes: 8 additions & 0 deletions)
@@ -29,6 +29,14 @@
PANDAS_1_2_0 = PANDAS_VERSION > packaging.version.parse("1.2.0")
WINDOWS = os.name == "nt"

SKLEARN_1_1_X = SK_VERSION >= packaging.version.parse("1.1")

# 'log_loss' is preferred as of scikit-learn 1.1
if SKLEARN_1_1_X:
SK_LOG_LOSS = "log_loss"
else:
SK_LOG_LOSS = "log"


@contextlib.contextmanager
def dummy_context(*args: Any, **kwargs: Any):
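
As a hypothetical usage sketch (not part of this diff), the SK_LOG_LOSS alias lets downstream code pass whichever loss name the installed scikit-learn accepts, since "log" was renamed to "log_loss" as of scikit-learn 1.1:

# Hypothetical usage of the compat alias added above; assumes scikit-learn
# and dask-ml are installed and that dask_ml._compat exposes SK_LOG_LOSS.
from sklearn.linear_model import SGDClassifier

from dask_ml._compat import SK_LOG_LOSS

# Resolves to "log_loss" on scikit-learn >= 1.1 and to "log" on older releases.
model = SGDClassifier(loss=SK_LOG_LOSS, tol=1e-3)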

dask_ml/cluster/k_means.py (1 change: 0 additions & 1 deletion)
@@ -541,7 +541,6 @@ def _kmeans_single_lloyd(
labels, distances = pairwise_distances_argmin_min(
X, centers, metric="euclidean", metric_kwargs={"squared": True}
)

labels = labels.astype(np.int32)
# distances is always float64, but we need it to match X.dtype
# for centers_dense, but remain float64 for inertia

dask_ml/metrics/regression.py (5 changes: 4 additions & 1 deletion)
@@ -3,6 +3,7 @@
import dask.array as da
import numpy as np
import sklearn.metrics
from dask import is_dask_collection
from dask.utils import derived_from

from .._typing import ArrayLike
@@ -16,7 +17,9 @@ def _check_sample_weight(sample_weight: Optional[ArrayLike]):
def _check_reg_targets(
y_true: ArrayLike, y_pred: ArrayLike, multioutput: Optional[str]
):
if multioutput is not None and multioutput != "uniform_average":
if multioutput is not None and (
is_dask_collection(multioutput) or multioutput != "uniform_average"
):
raise NotImplementedError("'multioutput' must be 'uniform_average'")

if y_true.ndim == 1:
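
The added is_dask_collection check rejects a dask object passed as multioutput up front, instead of letting the comparison with "uniform_average" build a lazy dask expression whose truth value is ambiguous. A rough usage sketch (hypothetical values, assuming dask_ml.metrics.mean_squared_error):

# Rough sketch, assuming dask_ml.metrics.mean_squared_error; only the default
# multioutput="uniform_average" is supported, and passing a dask collection
# (or any other value) raises NotImplementedError.
import dask.array as da
import numpy as np
from dask_ml.metrics import mean_squared_error

y_true = da.from_array(np.array([1.0, 2.0, 3.0]), chunks=3)
y_pred = da.from_array(np.array([1.1, 1.9, 3.2]), chunks=3)

mean_squared_error(y_true, y_pred)  # fine: uses the default multioutput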

dask_ml/model_selection/_incremental.py (9 changes: 5 additions & 4 deletions)
@@ -23,14 +23,15 @@
from sklearn.metrics import check_scoring
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.utils import check_random_state
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.utils.metaestimators import available_if
from sklearn.utils.validation import check_is_fitted

from .._compat import DISTRIBUTED_2021_02_0, annotate, dummy_context
from .._typing import ArrayLike, Int
from .._utils import LoggingContext
from ..wrappers import ParallelPostFit
from ._split import train_test_split
from .utils import estimator_has

logger = logging.getLogger("dask_ml.model_selection")

@@ -726,17 +727,17 @@ def fit(self, X, y=None, **fit_params):
return client.sync(self._fit, X, y, **fit_params)
return self._fit(X, y, **fit_params)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("decision_function"))
def decision_function(self, X):
self._check_is_fitted("decision_function")
return self.best_estimator_.decision_function(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("transform"))
def transform(self, X):
self._check_is_fitted("transform")
return self.best_estimator_.transform(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("inverse_transform"))
def inverse_transform(self, Xt):
self._check_is_fitted("inverse_transform")
return self.best_estimator_.transform(Xt)
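
Newer scikit-learn releases deprecate if_delegate_has_method in favour of available_if, which takes a callable predicate rather than delegate attribute names. A minimal sketch of the pattern used above, with a hypothetical wrapper class:

# Minimal sketch of the available_if pattern (hypothetical Wrapper class);
# the decorated method only appears on instances whose wrapped estimator
# actually provides it, so hasattr() gives the expected answer.
from sklearn.utils.metaestimators import available_if


def _wrapped_has(attr):
    def check(self):
        return hasattr(self.estimator, attr)

    return check


class Wrapper:
    def __init__(self, estimator):
        self.estimator = estimator

    @available_if(_wrapped_has("predict_proba"))
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)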

dask_ml/model_selection/_search.py (23 changes: 15 additions & 8 deletions)
@@ -32,7 +32,7 @@
)
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.utils._tags import _safe_tags
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.utils.metaestimators import available_if
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import _num_samples, check_is_fitted

@@ -54,7 +54,14 @@
pipeline,
score,
)
from .utils import DeprecationDict, is_dask_collection, to_indexable, to_keys, unzip
from .utils import (
DeprecationDict,
estimator_has,
is_dask_collection,
to_indexable,
to_keys,
unzip,
)

logger = logging.getLogger(__name__)

@@ -1126,37 +1133,37 @@ def classes_(self):
self._check_is_fitted("classes_")
return self.best_estimator_.classes_

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("predict"))
@derived_from(BaseSearchCV)
def predict(self, X):
self._check_is_fitted("predict")
return self.best_estimator_.predict(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("predict_proba"))
@derived_from(BaseSearchCV)
def predict_proba(self, X):
self._check_is_fitted("predict_proba")
return self.best_estimator_.predict_proba(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("predict_log_proba"))
@derived_from(BaseSearchCV)
def predict_log_proba(self, X):
self._check_is_fitted("predict_log_proba")
return self.best_estimator_.predict_log_proba(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("decision_function"))
@derived_from(BaseSearchCV)
def decision_function(self, X):
self._check_is_fitted("decision_function")
return self.best_estimator_.decision_function(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("transform"))
@derived_from(BaseSearchCV)
def transform(self, X):
self._check_is_fitted("transform")
return self.best_estimator_.transform(X)

@if_delegate_has_method(delegate=("best_estimator_", "estimator"))
@available_if(estimator_has("inverse_transform"))
@derived_from(BaseSearchCV)
def inverse_transform(self, Xt):
self._check_is_fitted("inverse_transform")

dask_ml/model_selection/utils.py (10 changes: 10 additions & 0 deletions)
@@ -97,6 +97,16 @@ def copy_estimator(est):
return copy.deepcopy(est)


def estimator_has(attr):
def _check(self):
if hasattr(self, "best_estimator_"):
return hasattr(self.best_estimator_, attr)
if hasattr(self, "estimator"):
return hasattr(self.estimator, attr)

return _check


def unzip(itbl, n):
return zip(*itbl) if itbl else [()] * n

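
The predicate checks best_estimator_ first (set once fit has run) and falls back to the unfitted estimator, so method availability on the search object tracks the underlying model both before and after fitting. A rough illustration, assuming the available_if-decorated methods in _search.py above:

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from dask_ml.model_selection import GridSearchCV

# LogisticRegression provides predict_proba, so the search object exposes it...
print(hasattr(GridSearchCV(LogisticRegression(), {"C": [0.1, 1.0]}), "predict_proba"))  # True

# ...while LinearSVC does not, so the method is hidden on the wrapper.
print(hasattr(GridSearchCV(LinearSVC(), {"C": [0.1, 1.0]}), "predict_proba"))  # False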

dask_ml/preprocessing/_encoders.py (14 changes: 13 additions & 1 deletion)
@@ -6,6 +6,7 @@
import pandas as pd
import sklearn.preprocessing

from .._compat import SKLEARN_1_1_X
from .._typing import ArrayLike, DataFrameType, DTypeLike, SeriesType
from ..base import DaskMLBaseMixin
from ..utils import check_array
@@ -160,6 +161,7 @@ def _fit(
X: Union[ArrayLike, DataFrameType],
handle_unknown: str = "error",
force_all_finite: bool = True,
return_counts=False,
):
X = self._validate_data(
X, accept_dask_dataframe=True, dtype=None, preserve_pandas_dataframe=True
@@ -168,8 +170,18 @@
self._check_feature_names(X, reset=True)

if isinstance(X, np.ndarray):
kwargs = {
"handle_unknown": handle_unknown,
"force_all_finite": force_all_finite,
}

# `return_counts` expected as of scikit-learn 1.1
if SKLEARN_1_1_X:
kwargs["return_counts"] = return_counts

return super(OneHotEncoder, self)._fit(
X, handle_unknown=handle_unknown, force_all_finite=force_all_finite
X,
**kwargs,
)

is_array = isinstance(X, da.Array)
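
scikit-learn 1.1 adds a return_counts argument to the base encoder's _fit (used for its infrequent-category support), so the keyword is only forwarded when the installed version understands it. A generic sketch of this version-gating pattern, with hypothetical helper names:

# Generic sketch of the version-gating pattern above (hypothetical helper);
# keywords that only newer scikit-learn releases accept are added
# conditionally so older releases never receive an unexpected argument.
import packaging.version
import sklearn

SKLEARN_1_1_X = packaging.version.parse(sklearn.__version__) >= packaging.version.parse("1.1")


def build_fit_kwargs(handle_unknown="error", force_all_finite=True, return_counts=False):
    kwargs = {"handle_unknown": handle_unknown, "force_all_finite": force_all_finite}
    if SKLEARN_1_1_X:
        kwargs["return_counts"] = return_counts
    return kwargs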

docs/source/conf.py (6 changes: 3 additions & 3 deletions)
@@ -46,11 +46,11 @@
]

intersphinx_mapping = {
"python": ("https://docs.python.org/3.6", None),
"sklearn": ("http://scikit-learn.org/stable/", None),
"python": ("https://docs.python.org", None),
"sklearn": ("https://scikit-learn.org/stable/", None),
"dask": ("https://docs.dask.org/en/latest/", None),
"distributed": ("https://distributed.dask.org/en/latest/", None),
"dask_glm": ("http://dask-glm.readthedocs.io/en/latest/", None),
"dask_glm": ("https://dask-glm.readthedocs.io/en/latest/", None),
}

numpydoc_class_members_toctree = False

docs/source/hyper-parameter-search.rst (1 change: 1 addition & 0 deletions)
@@ -420,6 +420,7 @@ This section uses :class:`~dask_ml.model_selection.HyperbandSearchCV`, but it can
also be applied to to :class:`~dask_ml.model_selection.IncrementalSearchCV` too.

.. ipython:: python
:okwarning:
from dask.distributed import Client
from dask_ml.datasets import make_classification

docs/source/incremental.rst (1 change: 1 addition & 0 deletions)
@@ -48,6 +48,7 @@ between machines.


.. ipython:: python
:okwarning:
from dask_ml.datasets import make_classification
from dask_ml.wrappers import Incremental

docs/source/meta-estimators.rst (2 changes: 2 additions & 0 deletions)
@@ -56,6 +56,7 @@ This class is useful for predicting for or transforming large datasets.
We'll make a larger dask array ``X_big`` with 10,000 samples per block.

.. ipython:: python
:okwarning:
X_big, _ = dask_ml.datasets.make_classification(n_samples=100000,
chunks=10000,
@@ -68,6 +69,7 @@ cause the scheduler to compute tasks in parallel. If you've connected to a
cluster of machines.

.. ipython:: python
:okexcept:
clf.predict_proba(X_big).compute()[:10]

docs/source/naive-bayes.rst (1 change: 1 addition & 0 deletions)
@@ -19,6 +19,7 @@ Example
-------

.. ipython:: python
:okwarning:
from dask_ml import datasets
from dask_ml.naive_bayes import GaussianNB

tests/model_selection/test_hyperband.py (4 changes: 2 additions & 2 deletions)
@@ -15,7 +15,7 @@
)
from sklearn.linear_model import SGDClassifier

from dask_ml._compat import DISTRIBUTED_2_5_0
from dask_ml._compat import DISTRIBUTED_2_5_0, SK_LOG_LOSS
from dask_ml.datasets import make_classification
from dask_ml.model_selection import (
HyperbandSearchCV,
@@ -53,7 +53,7 @@ async def test_basic(c, s, a, b, array_type, library, max_iter):
X, y = await c.gather(futures)

params = {
"loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
"loss": ["hinge", SK_LOG_LOSS, "modified_huber", "squared_hinge", "perceptron"],
"average": [True, False],
"learning_rate": ["constant", "invscaling", "optimal"],
"eta0": np.logspace(-2, 0, num=1000),

tests/model_selection/test_incremental.py (6 changes: 3 additions & 3 deletions)
@@ -27,7 +27,7 @@
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.utils import check_random_state

from dask_ml._compat import DISTRIBUTED_2_5_0
from dask_ml._compat import DISTRIBUTED_2_5_0, SK_LOG_LOSS
from dask_ml.datasets import make_classification
from dask_ml.model_selection import (
HyperbandSearchCV,
@@ -249,7 +249,7 @@ async def _test_search_basic(decay_rate, input_type, memory, c, s, a, b):
X, y = pd.DataFrame(X), pd.DataFrame(y)
assert isinstance(X, pd.DataFrame)

model = SGDClassifier(tol=1e-3, loss="log", penalty="elasticnet")
model = SGDClassifier(tol=1e-3, loss=SK_LOG_LOSS, penalty="elasticnet")

params = {"alpha": np.logspace(-2, 2, 100), "l1_ratio": np.linspace(0.01, 1, 200)}

@@ -588,7 +588,7 @@ async def test_model_random_determinism(c, s, a, b):
n_samples=n, n_features=d, chunks=n // 10, random_state=0
)
params = {
"loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
"loss": ["hinge", SK_LOG_LOSS, "modified_huber", "squared_hinge", "perceptron"],
"average": [True, False],
"learning_rate": ["constant", "invscaling", "optimal"],
"eta0": np.logspace(-2, 0, num=1000),

tests/preprocessing/test_encoders.py (2 changes: 2 additions & 0 deletions)
@@ -43,6 +43,7 @@ def test_basic_array(sparse, method, categories):
"active_features_",
"dtypes_",
"drop_idx_",
"infrequent_categories_",
},
)

@@ -88,6 +89,7 @@ def test_basic_dataframe(sparse, method, dask_data, dtype):
"active_features_",
"dtypes_",
"drop_idx_",
"infrequent_categories_",
},
)


tests/test_kmeans.py (4 changes: 1 addition & 3 deletions)
@@ -179,9 +179,7 @@ def test_dtypes(self):


def test_dataframes():
df = dd.from_pandas(
pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 10]}), npartitions=2
)
df = dd.from_pandas(pd.DataFrame(np.random.uniform(size=(100, 4))), npartitions=2)

kmeans = DKKMeans()
kmeans.fit(df)