From 62cb5727916560714ddae0113aaa8ff489aff713 Mon Sep 17 00:00:00 2001
From: Qiang Luo
Date: Fri, 3 Nov 2017 11:06:39 +0800
Subject: [PATCH 1/7] add joblib parallel for
 eli5.permutation_importance.get_score_importances

---
 eli5/permutation_importance.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/eli5/permutation_importance.py b/eli5/permutation_importance.py
index a84a85cc..31f0bac8 100644
--- a/eli5/permutation_importance.py
+++ b/eli5/permutation_importance.py
@@ -15,6 +15,7 @@
 
 import numpy as np  # type: ignore
 from sklearn.utils import check_random_state  # type: ignore
+from sklearn.externals.joblib import Parallel, delayed
 
 
 def iter_shuffled(X, columns_to_shuffle=None, pre_shuffle=False,
@@ -58,7 +59,8 @@ def get_score_importances(
         y,
         n_iter=5,  # type: int
         columns_to_shuffle=None,
-        random_state=None
+        random_state=None,
+        n_jobs=1
     ):
     # type: (...) -> Tuple[float, List[np.ndarray]]
     """
@@ -84,12 +86,13 @@
     """
     rng = check_random_state(random_state)
     base_score = score_func(X, y)
+    parallel = Parallel(n_jobs=n_jobs)
+    result = parallel(delayed(_get_scores_shufled)(
+        score_func, X, y, columns_to_shuffle=columns_to_shuffle,
+        random_state=rng
+    ) for _ in range(n_iter))
     scores_decreases = []
-    for i in range(n_iter):
-        scores_shuffled = _get_scores_shufled(
-            score_func, X, y, columns_to_shuffle=columns_to_shuffle,
-            random_state=rng
-        )
+    for scores_shuffled in result:
         scores_decreases.append(-scores_shuffled + base_score)
     return base_score, scores_decreases
 

From 1ceb4c60d0caafedde5f6686c138b02dd6d2626b Mon Sep 17 00:00:00 2001
From: Qiang Luo
Date: Fri, 3 Nov 2017 14:49:11 +0800
Subject: [PATCH 2/7] fix parallel seed problem

---
 eli5/permutation_importance.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/eli5/permutation_importance.py b/eli5/permutation_importance.py
index 31f0bac8..17d5c794 100644
--- a/eli5/permutation_importance.py
+++ b/eli5/permutation_importance.py
@@ -87,10 +87,11 @@ def get_score_importances(
     rng = check_random_state(random_state)
     base_score = score_func(X, y)
     parallel = Parallel(n_jobs=n_jobs)
+    seed0 = rng.randint(2**32)
     result = parallel(delayed(_get_scores_shufled)(
         score_func, X, y, columns_to_shuffle=columns_to_shuffle,
-        random_state=rng
-    ) for _ in range(n_iter))
+        random_state=np.random.RandomState(seed0+i)
+    ) for i in range(n_iter))
     scores_decreases = []
     for scores_shuffled in result:
         scores_decreases.append(-scores_shuffled + base_score)
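
The seeding change in PATCH 2/7 matters because, with a process-based backend, each worker receives a pickled copy of the same RandomState, so every iteration would typically repeat identical column shuffles. Below is a minimal standalone sketch of the per-iteration seeding idea; it is illustrative only and not part of the series, and the reduced seed bound and toy values are assumptions added here so the snippet stays portable.

    import numpy as np
    from sklearn.utils import check_random_state

    n_iter = 5
    rng = check_random_state(42)

    # Derive one base seed from the caller's RNG, then give every iteration its
    # own RandomState.  (The bound is reduced from the patch's 2**32 so that
    # seed0 + i always stays a valid 32-bit seed on every platform.)
    seed0 = rng.randint(2**31 - n_iter)
    states = [np.random.RandomState(seed0 + i) for i in range(n_iter)]

    # Each iteration now shuffles differently instead of repeating the same
    # permutation from a shared, copied RandomState.
    print([state.permutation(5).tolist() for state in states])
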
From f21628b6284a561c031e361e213d552dc7cf56e4 Mon Sep 17 00:00:00 2001
From: Qiang Luo
Date: Fri, 3 Nov 2017 15:02:13 +0800
Subject: [PATCH 3/7] add n_jobs for sklearn permutation_importance

---
 eli5/sklearn/permutation_importance.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/eli5/sklearn/permutation_importance.py b/eli5/sklearn/permutation_importance.py
index f3aa091d..613d2ebb 100644
--- a/eli5/sklearn/permutation_importance.py
+++ b/eli5/sklearn/permutation_importance.py
@@ -114,6 +114,8 @@ class PermutationImportance(BaseEstimator, MetaEstimatorMixin):
         Whether to fit the estimator on the whole data if cross-validation
         is used (default is False).
 
+    n_jobs : int, number of parallel jobs for shuffle iterations
+
     Attributes
     ----------
     feature_importances_ : array
@@ -139,7 +141,7 @@ class PermutationImportance(BaseEstimator, MetaEstimatorMixin):
         random state
     """
     def __init__(self, estimator, scoring=None, n_iter=5, random_state=None,
-                 cv='prefit', refit=True):
+                 cv='prefit', refit=True, n_jobs=1):
         # type: (...) -> None
         if isinstance(cv, str) and cv != "prefit":
             raise ValueError("Invalid cv value: {!r}".format(cv))
@@ -149,6 +151,7 @@ def __init__(self, estimator, scoring=None, n_iter=5, random_state=None,
         self.n_iter = n_iter
         self.random_state = random_state
         self.cv = cv
+        self.n_jobs = n_jobs
         self.rng_ = check_random_state(random_state)
 
     def fit(self, X, y, groups=None, **fit_params):
@@ -216,7 +219,7 @@ def _non_cv_scores_importances(self, X, y):
 
     def _get_score_importances(self, score_func, X, y):
         return get_score_importances(score_func, X, y, n_iter=self.n_iter,
-                                     random_state=self.rng_)
+                                     random_state=self.rng_, n_jobs=self.n_jobs)
 
     @property
     def caveats_(self):

From 0d14a5995e5b36f9d5fcddda72f2ba6eaa7f62a3 Mon Sep 17 00:00:00 2001
From: Qiang Luo
Date: Fri, 3 Nov 2017 15:12:05 +0800
Subject: [PATCH 4/7] add test

---
 tests/test_permutation_importance.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/tests/test_permutation_importance.py b/tests/test_permutation_importance.py
index effb4ff9..05567e0f 100644
--- a/tests/test_permutation_importance.py
+++ b/tests/test_permutation_importance.py
@@ -42,10 +42,11 @@ def is_shuffled(X, X_sh, col):
 def test_get_feature_importances(boston_train):
     X, y, feat_names = boston_train
     svr = SVR(C=20).fit(X, y)
-    score, importances = get_score_importances(svr.score, X, y)
-    assert score > 0.7
-    importances = dict(zip(feat_names, np.mean(importances, axis=0)))
-    print(score)
-    print(importances)
-    assert importances['AGE'] > importances['NOX']
-    assert importances['B'] > importances['CHAS']
+    for n_jobs in [1, 2]:
+        score, importances = get_score_importances(svr.score, X, y, n_jobs=n_jobs)
+        assert score > 0.7
+        importances = dict(zip(feat_names, np.mean(importances, axis=0)))
+        print(score)
+        print(importances)
+        assert importances['AGE'] > importances['NOX']
+        assert importances['B'] > importances['CHAS']
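
With PATCH 3/7 and the test in PATCH 4/7, the new argument is available both from the functional API and from the scikit-learn wrapper. A usage sketch follows; it is illustrative only: the toy data and parameter values are made up, while the functions and the n_jobs argument are the ones these patches add, so it assumes an eli5 build with the series applied.

    import numpy as np
    from sklearn.svm import SVR
    from eli5.permutation_importance import get_score_importances
    from eli5.sklearn import PermutationImportance

    def main():
        # Toy regression data, a stand-in for the Boston data used by the test.
        rs = np.random.RandomState(0)
        X = rs.rand(200, 5)
        y = X[:, 0] + 2 * X[:, 1] + 0.1 * rs.rand(200)
        model = SVR(C=20).fit(X, y)

        # Functional API: run the shuffle iterations on two worker processes.
        base_score, score_decreases = get_score_importances(
            model.score, X, y, n_iter=5, n_jobs=2, random_state=0)
        print(base_score, np.mean(score_decreases, axis=0))

        # Estimator wrapper: n_jobs is forwarded down to get_score_importances.
        perm = PermutationImportance(model, n_iter=5, n_jobs=2,
                                     random_state=0).fit(X, y)
        print(perm.feature_importances_)

    if __name__ == '__main__':  # guard needed when worker processes are spawned
        main()
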
From 91183d59835a88c842edb352be884b9e4bbd302d Mon Sep 17 00:00:00 2001
From: Qiang Luo
Date: Fri, 10 Nov 2017 14:13:33 +0800
Subject: [PATCH 5/7] use multiprocess to serialize method and lambda in
 parallel jobs

---
 eli5/permutation_importance.py | 14 +++++++-------
 requirements.txt               |  1 +
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/eli5/permutation_importance.py b/eli5/permutation_importance.py
index 17d5c794..649bdfa6 100644
--- a/eli5/permutation_importance.py
+++ b/eli5/permutation_importance.py
@@ -15,8 +15,7 @@
 
 import numpy as np  # type: ignore
 from sklearn.utils import check_random_state  # type: ignore
-from sklearn.externals.joblib import Parallel, delayed
-
+from multiprocess import Pool
 
 def iter_shuffled(X, columns_to_shuffle=None, pre_shuffle=False,
                   random_state=None):
@@ -86,12 +85,13 @@ def get_score_importances(
     """
     rng = check_random_state(random_state)
     base_score = score_func(X, y)
-    parallel = Parallel(n_jobs=n_jobs)
     seed0 = rng.randint(2**32)
-    result = parallel(delayed(_get_scores_shufled)(
-        score_func, X, y, columns_to_shuffle=columns_to_shuffle,
-        random_state=np.random.RandomState(seed0+i)
-    ) for i in range(n_iter))
+    pool = Pool(n_jobs)
+    result = pool.map(
+        lambda seed: _get_scores_shufled(score_func, X, y,
+                                         columns_to_shuffle=columns_to_shuffle,
+                                         random_state=np.random.RandomState(seed)),
+        range(seed0, seed0+n_iter))
     scores_decreases = []
     for scores_shuffled in result:
         scores_decreases.append(-scores_shuffled + base_score)
diff --git a/requirements.txt b/requirements.txt
index ca97e5d2..b70181b1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ attrs > 16.0.0
 jinja2
 pip >= 8.1
 setuptools >= 20.7
+multiprocess

From 0199ed25ab72baea0a9f5b0c548e4efac48e9203 Mon Sep 17 00:00:00 2001
From: Qiang Luo
Date: Fri, 10 Nov 2017 14:22:50 +0800
Subject: [PATCH 6/7] add multiprocess to setup.py

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index beb0a245..41397cf1 100755
--- a/setup.py
+++ b/setup.py
@@ -41,6 +41,7 @@ def get_long_description():
         'typing',
         'graphviz',
         'tabulate>=0.7.7',
+        'multiprocess',
     ],
     classifiers=[
         'Development Status :: 4 - Beta',

From 4f9b53d29fac4e3633d7d30bd32b152d30895712 Mon Sep 17 00:00:00 2001
From: Qiang Luo
Date: Thu, 21 Mar 2019 14:30:08 +0800
Subject: [PATCH 7/7] ignore type check for multiprocess

---
 eli5/permutation_importance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eli5/permutation_importance.py b/eli5/permutation_importance.py
index c28539af..b3cdad7d 100644
--- a/eli5/permutation_importance.py
+++ b/eli5/permutation_importance.py
@@ -15,7 +15,7 @@
 
 import numpy as np  # type: ignore
 from sklearn.utils import check_random_state  # type: ignore
-from multiprocess import Pool
+from multiprocess import Pool  # type: ignore
 
 def iter_shuffled(X, columns_to_shuffle=None, pre_shuffle=False,
                   random_state=None):
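
The switch in PATCH 5/7 exists because the parallel task is now a lambda that closes over score_func, X and y, which the plain pickle serializer used by multiprocessing (and, at the time, by the joblib bundled with scikit-learn) cannot handle; the multiprocess fork serializes with dill and can. A small standalone sketch of that difference, not part of the series; the example closure and values are made up for illustration.

    # A closure like the pool.map(lambda seed: ...) call in PATCH 5/7:
    # multiprocessing.Pool would typically fail to pickle it, while
    # multiprocess.Pool (dill-based) handles it.
    from multiprocess import Pool  # pip install multiprocess

    def run():
        offset = 10  # captured by the lambda, like score_func/X/y in the patch
        with Pool(2) as pool:
            return pool.map(lambda x: x + offset, range(4))

    if __name__ == '__main__':
        print(run())  # -> [10, 11, 12, 13]
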