Multiclass (#99)
* utility functions

* stepsize lower bound
sonichi authored Jun 4, 2021
1 parent 0d3a0bf commit f7cf2ea
Showing 11 changed files with 80 additions and 25 deletions.
1 change: 0 additions & 1 deletion README.md
@@ -140,7 +140,6 @@ For more technical details, please check our papers.
```
* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.
* [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.

* ChaCha for online AutoML. Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. To appear in ICML 2021.

## Contributing
3 changes: 2 additions & 1 deletion flaml/__init__.py
@@ -3,7 +3,8 @@
try:
from flaml.onlineml.autovw import AutoVW
except ImportError:
print('need to install vowpalwabbit to use AutoVW')
# print('need to install vowpalwabbit to use AutoVW')
pass
from flaml.version import __version__
import logging

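With this change, a missing vowpalwabbit no longer prints a message at import time; AutoVW is simply not defined. A minimal sketch of how a caller might probe for it (the None fallback is illustrative, not part of this commit):

```python
# Probe for the optional AutoVW class; when vowpalwabbit is absent,
# `from flaml import AutoVW` raises ImportError and we fall back to None.
try:
    from flaml import AutoVW
except ImportError:
    AutoVW = None  # online AutoML unavailable without vowpalwabbit
```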
11 changes: 7 additions & 4 deletions flaml/automl.py
@@ -809,14 +809,17 @@ def fit(self,
dataframe and label are ignored;
If not, dataframe and label must be provided.
metric: A string of the metric name or a function,
e.g., 'accuracy', 'roc_auc', 'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2'
e.g., 'accuracy', 'roc_auc', 'f1', 'micro_f1', 'macro_f1',
'log_loss', 'mae', 'mse', 'r2'
if passing a customized metric function, the function needs to
have the following signature:
.. code-block:: python
def custom_metric(X_test, y_test, estimator, labels,
X_train, y_train, weight_test=None, weight_train=None):
def custom_metric(
X_test, y_test, estimator, labels,
X_train, y_train, weight_test=None, weight_train=None
):
return metric_to_minimize, metrics_to_log
which returns a float number as the minimization objective,
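For illustration, a minimal sketch of a custom metric matching the documented signature; the choice of log loss and the logged dictionary keys are assumptions, not part of this commit:

```python
# A hypothetical custom metric: minimize test log loss and log auxiliary
# values. Only the signature is prescribed by the docstring above.
from sklearn.metrics import log_loss

def custom_metric(
    X_test, y_test, estimator, labels,
    X_train, y_train, weight_test=None, weight_train=None
):
    test_loss = log_loss(y_test, estimator.predict_proba(X_test),
                         labels=labels, sample_weight=weight_test)
    train_loss = log_loss(y_train, estimator.predict_proba(X_train),
                          labels=labels, sample_weight=weight_train)
    # the first return value is minimized; the second is logged per trial
    return test_loss, {'test_loss': test_loss, 'train_loss': train_loss}
```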
@@ -1238,7 +1241,7 @@ def _select_estimator(self, estimator_list):
for i, estimator in enumerate(estimator_list):
if estimator in self._search_states and (
self._search_states[estimator].sample_size
): # sample_size=none meaning no result
): # sample_size=None meaning no result
search_state = self._search_states[estimator]
if (self._search_states[estimator].time2eval_best
> self._state.time_budget - self._state.time_from_start
42 changes: 42 additions & 0 deletions flaml/ml.py
@@ -294,3 +294,45 @@ def get_classification_objective(num_labels: int) -> str:
else:
objective_name = 'multi:softmax'
return objective_name


def norm_confusion_matrix(y_true, y_pred):
'''Normalized confusion matrix
Args:
y_true: A numpy array or a pandas series of true labels
y_pred: A numpy array or a pandas series of predicted labels
Returns:
A normalized confusion matrix
'''
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_true, y_pred)
norm_conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]
return norm_conf_mat
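A quick usage sketch of the new helper with made-up labels (not part of the commit):

```python
import numpy as np
from flaml.ml import norm_confusion_matrix

y_true = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([0, 1, 1, 1, 2, 0])
# Row i is the distribution of predictions for true class i; each row sums to 1.
print(norm_confusion_matrix(y_true, y_pred))
```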


def multi_class_curves(y_true, y_pred_proba, curve_func):
'''Binarize the data for multi-class tasks and produce ROC or precision-recall curves
Args:
y_true: A numpy array or a pandas series of true labels
y_pred_proba: A numpy array or a pandas dataframe of predicted probabilities
curve_func: A function to produce a curve (e.g., roc_curve or precision_recall_curve)
Returns:
A tuple of two dictionaries with the same set of keys (class indices)
The first dictionary curve_x stores the x coordinates of each curve, e.g.,
curve_x[0] is a 1D array of the x coordinates of class 0
The second dictionary curve_y stores the y coordinates of each curve, e.g.,
curve_y[0] is a 1D array of the y coordinates of class 0
'''
from sklearn.preprocessing import label_binarize
classes = np.unique(y_true)
y_true_binary = label_binarize(y_true, classes=classes)

curve_x, curve_y = {}, {}
for i in range(len(classes)):
curve_x[i], curve_y[i], _ = curve_func(y_true_binary[:, i], y_pred_proba[:, i])
return curve_x, curve_y
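And a hedged sketch of plotting per-class ROC curves with this helper; the fitted classifier clf and the test split are assumed to exist, and matplotlib is assumed installed:

```python
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from flaml.ml import multi_class_curves

# For roc_curve, curve_x/curve_y map class index -> (fpr, tpr) arrays.
curve_x, curve_y = multi_class_curves(y_test, clf.predict_proba(X_test), roc_curve)
for i in curve_x:
    plt.plot(curve_x[i], curve_y[i], label=f'class {i}')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend()
plt.show()
```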
1 change: 1 addition & 0 deletions flaml/model.py
@@ -461,6 +461,7 @@ def __init__(
'booster': params.get('booster', 'gbtree'),
'colsample_bylevel': float(colsample_bylevel),
'colsample_bytree': float(colsample_bytree),
'use_label_encoder': params.get('use_label_encoder', False),
}

if 'regression' in task:
2 changes: 1 addition & 1 deletion flaml/onlineml/autovw.py
@@ -1,11 +1,11 @@
import numpy as np
from typing import Optional, Union
import logging
from flaml.tune import Trial, Categorical, Float, PolynomialExpansionSet, polynomial_expansion_set
from flaml.onlineml import OnlineTrialRunner
from flaml.scheduler import ChaChaScheduler
from flaml.searcher import ChampionFrontierSearcher
from flaml.onlineml.trial import get_ns_feature_dim_from_vw_example

logger = logging.getLogger(__name__)


5 changes: 3 additions & 2 deletions flaml/onlineml/trial.py
@@ -4,10 +4,10 @@
import math
import copy
import collections
from typing import Dict, Optional
from typing import Optional
from sklearn.metrics import mean_squared_error, mean_absolute_error
from vowpalwabbit import pyvw
from flaml.tune import Trial

logger = logging.getLogger(__name__)


@@ -270,6 +270,7 @@ class VowpalWabbitTrial(BaseOnlineTrial):
- Namespace vs features:
https://stackoverflow.com/questions/28586225/in-vowpal-wabbit-what-is-the-difference-between-a-namespace-and-feature
"""
from vowpalwabbit import pyvw
MODEL_CLASS = pyvw.vw
cost_unit = 1.0
interactions_config_key = 'interactions'
5 changes: 1 addition & 4 deletions flaml/searcher/__init__.py
@@ -1,6 +1,3 @@
from .blendsearch import CFO, BlendSearch, BlendSearchTuner
from .flow2 import FLOW2
try:
from .online_searcher import ChampionFrontierSearcher
except ImportError:
print('need to install vowpalwabbit to use ChampionFrontierSearcher')
from .online_searcher import ChampionFrontierSearcher
6 changes: 3 additions & 3 deletions flaml/searcher/blendsearch.py
@@ -109,10 +109,10 @@ def __init__(self,
init_config = low_cost_partial_config or {}
if not init_config:
logger.warning(
"No low-cost init config given to the search algorithm."
"No low-cost partial config given to the search algorithm. "
"For cost-frugal search, "
"consider providing init values for cost-related hps via "
"'init_config'."
"consider providing low-cost values for cost-related hps via "
"'low_cost_partial_config'."
)
self._points_to_evaluate = points_to_evaluate or []
self._config_constraints = config_constraints
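As context for the reworded warning, a hedged sketch of supplying low_cost_partial_config to BlendSearch; the search space, hyperparameter names, and exact constructor arguments are illustrative and may differ from this commit's API:

```python
from flaml import tune
from flaml.searcher.blendsearch import BlendSearch

# Start the search from cheap values of the cost-related hyperparameters
# so early trials are inexpensive; names and ranges here are made up.
search = BlendSearch(
    metric='val_loss', mode='min',
    space={
        'n_estimators': tune.randint(4, 1000),
        'learning_rate': tune.loguniform(1e-3, 1.0),
    },
    low_cost_partial_config={'n_estimators': 4},
)
```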
21 changes: 12 additions & 9 deletions flaml/searcher/flow2.py
@@ -124,15 +124,16 @@ def _init_search(self):
if callable(getattr(domain, 'get_sampler', None)):
self._tunable_keys.append(key)
sampler = domain.get_sampler()
# if isinstance(sampler, sample.Quantized):
# sampler_inner = sampler.get_sampler()
# if str(sampler_inner) == 'Uniform':
# self._step_lb = min(
# self._step_lb, sampler.q/(domain.upper-domain.lower))
# elif isinstance(domain, sample.Integer) and str(
# sampler) == 'Uniform':
# self._step_lb = min(
# self._step_lb, 1.0/(domain.upper-domain.lower))
# the step size lower bound for uniform variables doesn't depend
# on the current config
if isinstance(sampler, sample.Quantized):
sampler_inner = sampler.get_sampler()
if str(sampler_inner) == 'Uniform':
self._step_lb = min(
self._step_lb, sampler.q / (domain.upper - domain.lower))
elif isinstance(domain, sample.Integer) and str(sampler) == 'Uniform':
self._step_lb = min(
self._step_lb, 1.0 / (domain.upper - domain.lower))
if isinstance(domain, sample.Categorical):
cat_hp_cost = self.cat_hp_cost
if cat_hp_cost and key in cat_hp_cost:
@@ -199,6 +200,8 @@ def step_lower_bound(self) -> float:
continue
domain = self.space[key]
sampler = domain.get_sampler()
# the stepsize lower bound for log uniform variables depends on the
# current config
if isinstance(sampler, sample.Quantized):
sampler_inner = sampler.get_sampler()
if str(sampler_inner) == 'LogUniform':
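To make the uncommented bound concrete, a small worked sketch in plain Python (illustrative numbers, not FLAML API): for a quantized uniform variable the bound is q/(upper-lower), for an integer uniform variable it is 1/(upper-lower), while for log-uniform variables the bound depends on the current config, as the comments note.

```python
# Step size lower bounds for uniform-type domains, per the logic above.
lower, upper, q = 0.0, 10.0, 0.5
step_lb_quantized = q / (upper - lower)          # 0.05
int_lower, int_upper = 1, 9
step_lb_integer = 1.0 / (int_upper - int_lower)  # 0.125
print(step_lb_quantized, step_lb_integer)
```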
8 changes: 8 additions & 0 deletions test/test_automl.py
@@ -260,6 +260,14 @@ def test_micro_macro_f1(self):
X_train=X_train, y_train=y_train, metric='micro_f1', **automl_settings)
automl_experiment_macro.fit(
X_train=X_train, y_train=y_train, metric='macro_f1', **automl_settings)
estimator = automl_experiment_macro.model
y_pred = estimator.predict(X_train)
y_pred_proba = estimator.predict_proba(X_train)
from flaml.ml import norm_confusion_matrix, multi_class_curves
print(norm_confusion_matrix(y_train, y_pred))
from sklearn.metrics import roc_curve, precision_recall_curve
print(multi_class_curves(y_train, y_pred_proba, roc_curve))
print(multi_class_curves(y_train, y_pred_proba, precision_recall_curve))

def test_regression(self):
automl_experiment = AutoML()
