[MRG+1] Fix LOF and Isolation benchmarks #9798
Changes from 7 commits: 67a85ec, 5a0c557, 9825623, 16804a1, 809fbe0, 8696938, aaf9e51, 3e06c31
**IsolationForest benchmark**

```diff
@@ -3,6 +3,17 @@
 IsolationForest benchmark
 ==========================================
 A test of IsolationForest on classical anomaly detection datasets.
+
+The benchmark is run as follows:
+1. The dataset is randomly split into a training set and a test set, both
+assumed to contain outliers.
+2. Isolation Forest is trained on the training set.
+3. The ROC curve is computed on the test set using the knowledge of the labels.
+
+Note that the smtp dataset contains a very small proportion of outliers.
+Therefore, depending on the seed of the random number generator, randomly
+splitting the data set might lead to a test set containing no outliers. In this
+case a warning is raised when computing the ROC curve.
 """

 from time import time
```
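The three steps of the new docstring read directly as code. A minimal sketch of the procedure on synthetic data (the toy dataset, sizes, and contamination below are illustrative, not the benchmark's):

```python
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

SEED = 1
rng = np.random.RandomState(SEED)

# Synthetic data: 500 inliers (label 0) and 25 outliers (label 1).
X = np.r_[rng.randn(500, 2), rng.uniform(-6, 6, size=(25, 2))]
y = np.r_[np.zeros(500), np.ones(25)]

# 1. Random split; both halves may contain outliers.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=SEED)

# 2. Fit on the training set.
model = IsolationForest(random_state=SEED).fit(X_train)

# 3. ROC on the test set; lower decision_function means more abnormal,
# so negate it to get an outlier score.
scoring = -model.decision_function(X_test)
fpr, tpr, _ = roc_curve(y_test, scoring)
print('AUC = %0.3f' % auc(fpr, tpr))
```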
```diff
@@ -12,7 +23,7 @@
 from sklearn.ensemble import IsolationForest
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
-from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.preprocessing import LabelBinarizer
 from sklearn.utils import shuffle as sh

 print(__doc__)
```
```diff
@@ -30,15 +41,14 @@ def print_outlier_ratio(y):
     print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))


-np.random.seed(1)
+SEED = 1
 fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))

 # Set this to true for plotting score histograms for each dataset:
 with_decision_function_histograms = False

-# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
-# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
+# datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

 # Loop over all datasets for fitting and scoring the estimator:
 for dat in datasets:
```

> **Review comment** on the `# datasets available = [...]` line: Now that the shuttle dataset is run by default, we can remove this comment.
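The `SEED` change replaces the module-level `np.random.seed(1)` with an explicit seed passed to every call that accepts `random_state`. A small sketch of why the explicit form is more robust (toy arrays, not benchmark data):

```python
import numpy as np
from sklearn.utils import shuffle

X = np.arange(10).reshape(5, 2)
y = np.arange(5)

# Global seeding works only if nothing else consumes the stream first;
# inserting any np.random call before this line changes the result.
np.random.seed(1)
print(shuffle(X, y))

# An explicit random_state pins this call regardless of global state.
SEED = 1
print(shuffle(X, y, random_state=SEED))
```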
```diff
@@ -47,15 +57,16 @@ def print_outlier_ratio(y):
     print('====== %s ======' % dat)
     print('--- Fetching data...')
     if dat in ['http', 'smtp', 'SF', 'SA']:
-        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
+        dataset = fetch_kddcup99(subset=dat, shuffle=True,
+                                 percent10=True, random_state=SEED)
         X = dataset.data
         y = dataset.target

     if dat == 'shuttle':
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
+        X, y = sh(X, y, random_state=SEED)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
```
```diff
@@ -65,7 +76,7 @@ def print_outlier_ratio(y):
         print('----- ')

     if dat == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype(shuffle=True, random_state=SEED)
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
```
```diff
@@ -79,17 +90,17 @@ def print_outlier_ratio(y):
     print('--- Vectorizing data...')

     if dat == 'SF':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)

     if dat == 'SA':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
-        x2 = lb.fit_transform(X[:, 2])
-        x3 = lb.fit_transform(X[:, 3])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)
```
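Why the switch matters: each kddcup99 categorical column holds a single value per sample. `LabelBinarizer` one-hot encodes those values, while `MultiLabelBinarizer` treats each string as a *collection* of labels and binarizes individual characters. A small illustration (the values are made-up stand-ins for the dataset's byte strings after `.astype(str)`):

```python
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

# Stand-in for one categorical kddcup99 column after .astype(str).
col = np.array(['tcp', 'udp', 'icmp', 'tcp'])

lb = LabelBinarizer()
print(lb.fit_transform(col))  # one indicator column per protocol
print(lb.classes_)            # ['icmp' 'tcp' 'udp']

mlb = MultiLabelBinarizer()
print(mlb.fit_transform(col))  # one column per *character*: wrong here
print(mlb.classes_)            # ['c' 'd' 'i' 'm' 'p' 't' 'u']
```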
```diff
@@ -108,7 +119,7 @@ def print_outlier_ratio(y):
     y_test = y[n_samples_train:]

     print('--- Fitting the IsolationForest estimator...')
-    model = IsolationForest(n_jobs=-1)
+    model = IsolationForest(n_jobs=-1, random_state=SEED)
     tstart = time()
     model.fit(X_train)
     fit_time = time() - tstart
```
**LocalOutlierFactor benchmark**

```diff
@@ -5,6 +5,16 @@
 A test of LocalOutlierFactor on classical anomaly detection datasets.
+
+Note that LocalOutlierFactor is not meant to predict on a test set and its
+performance is assessed in an outlier detection context:
+1. The model is trained on the whole dataset which is assumed to contain
+outliers.
+2. The ROC curve is computed on the same dataset using the knowledge of the
+labels.
+In this context there is no need to shuffle the dataset because the model
+is trained and tested on the whole dataset. The randomness of this benchmark
+is only caused by the random selection of anomalies in the SA dataset.
 """

 from time import time
```
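The rewritten benchmark no longer scores held-out data. A minimal sketch of the new protocol, fitting on the whole dataset and ranking the same samples (the toy data and sizes are illustrative):

```python
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(2)

# Synthetic data: inliers (label 0) plus a few outliers (label 1).
X = np.r_[rng.randn(500, 2), rng.uniform(-6, 6, size=(25, 2))]
y = np.r_[np.zeros(500), np.ones(25)]

# 1. Fit on the whole dataset, outliers included.
model = LocalOutlierFactor(n_neighbors=20)
model.fit(X)

# 2. Score the same samples: negative_outlier_factor_ is close to -1
# for inliers and much lower for outliers, so negate it for roc_curve.
scoring = -model.negative_outlier_factor_
fpr, tpr, _ = roc_curve(y, scoring)
print('AUC = %0.3f' % auc(fpr, tpr))
```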
```diff
@@ -14,31 +24,28 @@
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
 from sklearn.preprocessing import LabelBinarizer
-from sklearn.utils import shuffle as sh

 print(__doc__)

-np.random.seed(2)
+SEED = 2  # to control the random selection of anomalies in the SA dataset

 # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['shuttle']
-
-novelty_detection = True  # if False, training set polluted by outliers
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

 plt.figure()
 for dataset_name in datasets:
     # loading and vectorization
     print('loading data')
     if dataset_name in ['http', 'smtp', 'SA', 'SF']:
-        dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
-                                 percent10=False)
+        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
+                                 random_state=SEED)
         X = dataset.data
         y = dataset.target

     if dataset_name == 'shuttle':
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
```
```diff
@@ -47,7 +54,7 @@
     y = (y != 1).astype(int)

     if dataset_name == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype()
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
```
```diff
@@ -61,54 +68,34 @@

     if dataset_name == 'SF':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)

     if dataset_name == 'SA':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
-        lb.fit(X[:, 2])
-        x2 = lb.transform(X[:, 2])
-        lb.fit(X[:, 3])
-        x3 = lb.transform(X[:, 3])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)
```

> **Review comment** on the `b'normal.'` comparison: works with python2 and python3?
>
> **Reply:** yes
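The `b'normal.'` fix is the crux of the broken labels: under Python 3, fetch_kddcup99 returns byte-string targets, so comparing against the unicode literal `'normal.'` marks *every* sample an outlier. A small sketch (the sample values are illustrative stand-ins):

```python
import numpy as np

# Byte-string labels, as fetch_kddcup99 returns under Python 3.
y = np.array([b'normal.', b'smurf.', b'normal.'], dtype=object)

print((y != 'normal.').astype(int))   # [1 1 1] -- a str never equals bytes
print((y != b'normal.').astype(int))  # [0 1 0] -- the intended 0/1 labels
```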
```diff

     if dataset_name == 'http' or dataset_name == 'smtp':
-        y = (y != 'normal.').astype(int)
-
-    n_samples, n_features = np.shape(X)
-    n_samples_train = n_samples // 2
-    n_samples_test = n_samples - n_samples_train
+        y = (y != b'normal.').astype(int)

     X = X.astype(float)
-    X_train = X[:n_samples_train, :]
-    X_test = X[n_samples_train:, :]
-    y_train = y[:n_samples_train]
-    y_test = y[n_samples_train:]
-
-    if novelty_detection:
-        X_train = X_train[y_train == 0]
-        y_train = y_train[y_train == 0]

     print('LocalOutlierFactor processing...')
     model = LocalOutlierFactor(n_neighbors=20)
     tstart = time()
-    model.fit(X_train)
+    model.fit(X)
     fit_time = time() - tstart
-    tstart = time()
-
-    scoring = -model.decision_function(X_test)  # the lower, the more normal
-    predict_time = time() - tstart
-    fpr, tpr, thresholds = roc_curve(y_test, scoring)
+    scoring = -model.negative_outlier_factor_  # the lower, the more normal
+    fpr, tpr, thresholds = roc_curve(y, scoring)
     AUC = auc(fpr, tpr)
     plt.plot(fpr, tpr, lw=1,
-             label=('ROC for %s (area = %0.3f, train-time: %0.2fs,'
-                    'test-time: %0.2fs)' % (dataset_name, AUC, fit_time,
-                                            predict_time)))
+             label=('ROC for %s (area = %0.3f, train-time: %0.2fs)'
+                    % (dataset_name, AUC, fit_time)))

 plt.xlim([-0.05, 1.05])
 plt.ylim([-0.05, 1.05])
```
> **Review comment** on the `SEED = 2` line: I'm more used to seeing `rng = np.random.RandomState(0)`, but I think it doesn't matter.

> **Review comment:** Don't use a new name for something we refer to as `random_state`. Use `random_state = 1` and pass `random_state=random_state` in function calls.
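A sketch of the convention the second reviewer is asking for (the two calls are taken from this PR's diff; the snippet itself is illustrative, not part of the PR):

```python
from sklearn.datasets import fetch_kddcup99
from sklearn.ensemble import IsolationForest

random_state = 1  # keep the name the rest of the codebase uses

# Thread the same variable through every call that accepts one.
dataset = fetch_kddcup99(subset='smtp', shuffle=True, percent10=True,
                         random_state=random_state)
model = IsolationForest(n_jobs=-1, random_state=random_state)
```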