diff --git a/flaml/automl.py b/flaml/automl.py
index 2b7ff028f3..2e24c57dd4 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -589,6 +589,12 @@ def _prepare_data(self,
             self._state.y_val = (X_train, y_train, X_val, y_val)
         if self._split_type == "stratified":
             logger.info("Using StratifiedKFold")
+            assert y_train_all.size >= n_splits, (
+                f"{n_splits}-fold cross validation"
+                f" requires input data with at least {n_splits} examples.")
+            assert y_train_all.size >= 2*n_splits, (
+                f"{n_splits}-fold cross validation with metric=r2 "
+                f"requires input data with at least {n_splits*2} examples.")
             self._state.kf = RepeatedStratifiedKFold(n_splits=n_splits,
                 n_repeats=1, random_state=RANDOM_SEED)
         else:
@@ -1045,7 +1051,7 @@ def _search(self):
                 init_config=None,
                 search_alg=search_state.search_alg,
                 time_budget_s=budget_left,
-                verbose=max(self.verbose-1,0), local_dir='logs/tune_results',
+                verbose=max(self.verbose-1,0), #local_dir='logs/tune_results',
                 use_ray=False,
                 )
             # warnings.resetwarnings()
diff --git a/flaml/data.py b/flaml/data.py
index 18c29e0962..313fa27e03 100644
--- a/flaml/data.py
+++ b/flaml/data.py
@@ -192,11 +192,13 @@ def fit_transform(self, X, y, task):
             X = X.copy()
             n = X.shape[0]
             cat_columns, num_columns = [], []
+            drop = False
             for column in X.columns:
                 if X[column].dtype.name in ('object', 'category'):
                     if X[column].nunique() == 1 or X[column].nunique(
                         dropna=True) == n - X[column].isnull().sum():
                         X.drop(columns=column, inplace=True)
+                        drop = True
                     elif X[column].dtype.name == 'category':
                         current_categories = X[column].cat.categories
                         if '__NAN__' not in current_categories:
@@ -204,27 +206,33 @@ def fit_transform(self, X, y, task):
                             '__NAN__').fillna('__NAN__')
                         cat_columns.append(column)
                     else:
-                        X[column].fillna('__NAN__', inplace=True)
+                        X[column] = X[column].fillna('__NAN__')
                         cat_columns.append(column)
                 else:
                     # print(X[column].dtype.name)
                     if X[column].nunique(dropna=True) < 2:
                         X.drop(columns=column, inplace=True)
+                        drop = True
                     else:
-                        X[column].fillna(np.nan, inplace=True)
+                        X[column] = X[column].fillna(np.nan)
                         num_columns.append(column)
             X = X[cat_columns + num_columns]
             if cat_columns:
                 X[cat_columns] = X[cat_columns].astype('category')
             if num_columns:
+                X_num = X[num_columns]
+                if drop and np.issubdtype(X_num.columns.dtype, np.integer):
+                    X_num.columns = range(X_num.shape[1])
+                else: drop = False
                 from sklearn.impute import SimpleImputer
                 from sklearn.compose import ColumnTransformer
                 self.transformer = ColumnTransformer([(
                     'continuous',
                     SimpleImputer(missing_values=np.nan, strategy='median'),
-                    num_columns)])
-                X[num_columns] = self.transformer.fit_transform(X)
+                    X_num.columns)])
+                X[num_columns] = self.transformer.fit_transform(X_num)
             self._cat_columns, self._num_columns = cat_columns, num_columns
+            self._drop = drop
         if task == 'regression':
             self.label_transformer = None
@@ -241,7 +249,7 @@ def transform(self, X):
             for column in cat_columns:
                 # print(column, X[column].dtype.name)
                 if X[column].dtype.name == 'object':
-                    X[column].fillna('__NAN__', inplace=True)
+                    X[column] = X[column].fillna('__NAN__')
                 elif X[column].dtype.name == 'category':
                     current_categories = X[column].cat.categories
                     if '__NAN__' not in current_categories:
@@ -250,6 +258,8 @@ def transform(self, X):
             if cat_columns:
                 X[cat_columns] = X[cat_columns].astype('category')
             if num_columns:
-                X[num_columns].fillna(np.nan, inplace=True)
-                X[num_columns] = self.transformer.transform(X)
+                X_num = X[num_columns].fillna(np.nan)
+                if self._drop:
+                    X_num.columns = range(X_num.shape[1])
+                X[num_columns] = self.transformer.transform(X_num)
         return X
diff --git a/flaml/model.py b/flaml/model.py
index dcd13870be..0ddb16fddf 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -239,11 +239,8 @@ def __init__(self, task='binary:logistic', n_jobs=1,
         else: objective = 'regression'
         self.params = {
             "n_estimators": int(round(n_estimators)),
-            "num_leaves": params[
-                'num_leaves'] if 'num_leaves' in params else int(
-                round(max_leaves)),
-            'objective': params[
-                "objective"] if "objective" in params else objective,
+            "num_leaves": params.get('num_leaves', int(round(max_leaves))),
+            'objective': params.get("objective", objective),
             'n_jobs': n_jobs,
             'learning_rate': float(learning_rate),
             'reg_alpha': float(reg_alpha),
@@ -359,18 +356,17 @@ def __init__(self, task='regression', all_thread=False, n_jobs=1,
         self._max_leaves = int(round(max_leaves))
         self.params = {
             'max_leaves': int(round(max_leaves)),
-            'max_depth': 0,
-            'grow_policy': params[
-                "grow_policy"] if "grow_policy" in params else 'lossguide',
-            'tree_method':tree_method,
-            'verbosity': 0,
-            'nthread':n_jobs,
+            'max_depth': params.get('max_depth', 0),
+            'grow_policy': params.get("grow_policy", 'lossguide'),
+            'tree_method': tree_method,
+            'verbosity': params.get('verbosity', 0),
+            'nthread': n_jobs,
             'learning_rate': float(learning_rate),
             'subsample': float(subsample),
             'reg_alpha': float(reg_alpha),
             'reg_lambda': float(reg_lambda),
             'min_child_weight': float(min_child_weight),
-            'booster': params['booster'] if 'booster' in params else 'gbtree',
+            'booster': params.get('booster', 'gbtree'),
             'colsample_bylevel': float(colsample_bylevel),
             'colsample_bytree':float(colsample_bytree),
         }
@@ -429,9 +425,8 @@ def __init__(self, task='binary:logistic', n_jobs=1,
             "n_estimators": int(round(n_estimators)),
             'max_leaves': int(round(max_leaves)),
             'max_depth': 0,
-            'grow_policy': params[
-                "grow_policy"] if "grow_policy" in params else 'lossguide',
-            'tree_method':tree_method,
+            'grow_policy': params.get("grow_policy", 'lossguide'),
+            'tree_method': tree_method,
             'verbosity': 0,
             'n_jobs': n_jobs,
             'learning_rate': float(learning_rate),
@@ -439,7 +434,7 @@ def __init__(self, task='binary:logistic', n_jobs=1,
             'reg_alpha': float(reg_alpha),
             'reg_lambda': float(reg_lambda),
             'min_child_weight': float(min_child_weight),
-            'booster': params['booster'] if 'booster' in params else 'gbtree',
+            'booster': params.get('booster', 'gbtree'),
             'colsample_bylevel': float(colsample_bylevel),
             'colsample_bytree': float(colsample_bytree),
         }
@@ -544,10 +539,10 @@ def __init__(self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0,
                  **params):
         super().__init__(task, **params)
         self.params = {
-            'penalty': 'l1',
+            'penalty': params.get("penalty", 'l1'),
             'tol': float(tol),
             'C': float(C),
-            'solver': 'saga',
+            'solver': params.get("solver", 'saga'),
             'n_jobs': n_jobs,
         }
         if 'regression' in task:
@@ -573,10 +568,10 @@ def __init__(self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0,
                  **params):
         super().__init__(task, **params)
         self.params = {
-            'penalty': 'l2',
+            'penalty': params.get("penalty", 'l2'),
             'tol': float(tol),
             'C': float(C),
-            'solver': 'lbfgs',
+            'solver': params.get("solver", 'lbfgs'),
             'n_jobs': n_jobs,
         }
         if 'regression' in task:
@@ -625,9 +620,8 @@ def __init__(self, task = 'binary:logistic', n_jobs=1,
             "n_estimators": n_estimators,
             'learning_rate': learning_rate,
             'thread_count': n_jobs,
-            'verbose': False,
-            'random_seed': params[
-                "random_seed"] if "random_seed" in params else 10242048,
+            'verbose': params.get('verbose', False),
+            'random_seed': params.get("random_seed", 10242048),
         }
         if 'regression' in task:
             from catboost import CatBoostRegressor
@@ -724,7 +718,7 @@ def __init__(self, task='binary:logistic', n_jobs=1,
         super().__init__(task, **params)
         self.params= {
             'n_neighbors': int(round(n_neighbors)),
-            'weights': 'distance',
+            'weights': params.get('weights', 'distance'),
             'n_jobs': n_jobs,
         }
         if 'regression' in task:
diff --git a/flaml/tune/tune.py b/flaml/tune/tune.py
index ad8bebf0a7..ae00d741c0 100644
--- a/flaml/tune/tune.py
+++ b/flaml/tune/tune.py
@@ -205,7 +205,7 @@ def compute_with_config(config):
             0 = silent, 1 = only status updates, 2 = status and brief trial
             results, 3 = status and detailed trial results. Defaults to 2.
         local_dir: A string of the local dir to save ray logs if ray backend is
-            used.
+            used; or a local dir to save the tuning log.
         num_samples: An integer of the number of configs to try. Defaults to 1.
         resources_per_trial: A dictionary of the hardware resources to allocate
             per trial, e.g., `{'mem': 1024**3}`. When not using ray backend,
@@ -221,9 +221,18 @@ def compute_with_config(config):
     _verbose = verbose
     if verbose > 0:
         import os
-        os.makedirs(local_dir, exist_ok=True)
-        logger.addHandler(logging.FileHandler(local_dir+'/tune_'+str(
-            datetime.datetime.now()).replace(':', '-')+'.log'))
+        if local_dir:
+            os.makedirs(local_dir, exist_ok=True)
+            logger.addHandler(logging.FileHandler(local_dir+'/tune_'+str(
+                datetime.datetime.now()).replace(':', '-')+'.log'))
+        elif not logger.handlers:
+            # Add the console handler.
+            _ch = logging.StreamHandler()
+            logger_formatter = logging.Formatter(
+                '[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
+                '%m-%d %H:%M:%S')
+            _ch.setFormatter(logger_formatter)
+            logger.addHandler(_ch)
         if verbose<=2:
             logger.setLevel(logging.INFO)
         else:
diff --git a/test/nni/flaml_nni_wrap.py b/test/nni/flaml_nni_wrap.py
index 60265f87d8..67a6758478 100644
--- a/test/nni/flaml_nni_wrap.py
+++ b/test/nni/flaml_nni_wrap.py
@@ -1,5 +1,7 @@
-from flaml.searcher.blendsearch import BlendSearchTuner as BST, BlendSearch
+from flaml.searcher.blendsearch import BlendSearchTuner as BST
+
+
 class BlendSearchTuner(BST):
     # for best performance pass low cost initial parameters here
     def __init__(self, points_to_evaluate=[{"hidden_size":128}]):
-        super.__init__(self,points_to_evaluate=points_to_evaluate)
+        super.__init__(self, points_to_evaluate=points_to_evaluate)
diff --git a/test/test_automl.py b/test/test_automl.py
index 247adcccf0..c8b2960e55 100644
--- a/test/test_automl.py
+++ b/test/test_automl.py
@@ -172,6 +172,10 @@ def test_classification(self, as_frame=False):
             "model_history": True
         }
         X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
+        if as_frame:
+            # test drop column
+            X_train.columns = range(X_train.shape[1])
+            X_train[X_train.shape[1]] = np.zeros(len(y_train))
         automl_experiment.fit(X_train=X_train, y_train=y_train,
                               **automl_settings)
         print(automl_experiment.classes_)
@@ -252,7 +256,8 @@ def test_sparse_matrix_regression(self):
             "task": 'regression',
             "log_file_name": "test/sparse_regression.log",
             "n_jobs": 1,
-            "model_history": True
+            "model_history": True,
+            "verbose": 0,
         }
         X_train = scipy.sparse.random(300, 900, density=0.0001)
         y_train = np.random.uniform(size=300)
@@ -327,10 +332,11 @@ def test_sparse_matrix_regression_cv(self):
             "task": 'regression',
             "log_file_name": "test/sparse_regression.log",
             "n_jobs": 1,
-            "model_history": True
+            "model_history": True,
+            "metric": "mse"
         }
-        X_train = scipy.sparse.random(100, 100)
-        y_train = np.random.uniform(size=100)
+        X_train = scipy.sparse.random(8, 100)
+        y_train = np.random.uniform(size=8)
         automl_experiment.fit(X_train=X_train, y_train=y_train,
                               **automl_settings)
         print(automl_experiment.predict(X_train))
diff --git a/test/test_training_log.py b/test/test_training_log.py
index 18f897b6ad..c82013c30d 100644
--- a/test/test_training_log.py
+++ b/test/test_training_log.py
@@ -25,7 +25,8 @@ def test_training_log(self):
             "log_training_metric": True,
             "mem_thres": 1024*1024,
             "n_jobs": 1,
-            "model_history": True
+            "model_history": True,
+            "verbose": 2,
         }
         X_train, y_train = load_boston(return_X_y=True)
         automl_experiment.fit(X_train=X_train, y_train=y_train,