Skip to content

Commit

Permalink
add sklearn 0.18 layer to scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
rasbt committed Sep 30, 2016
1 parent e06b7e5 commit f779b3d
Show file tree
Hide file tree
Showing 11 changed files with 179 additions and 58 deletions.
Binary file not shown.
Binary file not shown.
Binary file added code/ch09/movieclassifier/reviews.sqlite
Binary file not shown.
24 changes: 24 additions & 0 deletions code/ch09/movieclassifier/vectorizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle

cur_dir = os.path.dirname(__file__)

# Load the pickled stopword collection once at import time.
# BUG FIX: the original called pickle.load(open(...)) and never closed the
# file handle; a context manager releases it deterministically.
with open(os.path.join(cur_dir,
                       'pkl_objects',
                       'stopwords.pkl'), 'rb') as _stopwords_file:
    stop = pickle.load(_stopwords_file)

def tokenizer(text):
    """Strip HTML, preserve emoticons, and return lowercased word tokens.

    Markup tags are removed first, emoticons such as ``:)``, ``;-(`` or
    ``=D`` are extracted and re-appended after the text (with their ``-``
    noses dropped), every other non-word character becomes a space, and
    stopwords are filtered out.

    Parameters
    ----------
    text : str
        Raw (possibly HTML-containing) document text.

    Returns
    -------
    list of str
        Cleaned tokens, excluding words in the module-level ``stop`` set.
    """
    # BUG FIX: regex patterns are now raw strings; '\W' and '\)' in plain
    # string literals are invalid escape sequences (DeprecationWarning on
    # Python 3.6+, SyntaxError in future versions).
    text = re.sub(r'<[^>]*>', '', text)  # drop HTML tags
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

# Stateless hashing vectorizer: unlike CountVectorizer/TfidfVectorizer it
# needs no fitting on a vocabulary, so it can be recreated identically in
# the web app without pickling fitted state.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
12 changes: 10 additions & 2 deletions code/optional-py-scripts/ch03.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

import numpy as np
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
Expand All @@ -25,6 +24,14 @@
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

# for sklearn 0.18's alternative syntax
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
if Version(sklearn_version) < '0.18':
    # BUG FIX: before 0.18, train_test_split lived in
    # sklearn.cross_validation, NOT sklearn.grid_search — the original
    # import raised ImportError on every pre-0.18 installation.
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split

#############################################################################
print(50 * '=')
print('Section: First steps with scikit-learn')
Expand Down Expand Up @@ -191,7 +198,8 @@ def cost_0(z):
# plt.savefig('./figures/logistic_regression.png', dpi=300)
plt.show()

print('Predicted probabilities', lr.predict_proba(X_test_std[0, :]))
print('Predicted probabilities', lr.predict_proba(X_test_std[0, :]
.reshape(1, -1)))

#############################################################################
print(50 * '=')
Expand Down
16 changes: 14 additions & 2 deletions code/optional-py-scripts/ch04.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
Expand All @@ -27,6 +26,13 @@
from itertools import combinations
import matplotlib.pyplot as plt

# for sklearn 0.18's alternative syntax
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
if Version(sklearn_version) < '0.18':
    # BUG FIX: before 0.18, train_test_split lived in
    # sklearn.cross_validation, NOT sklearn.grid_search — the original
    # import raised ImportError on every pre-0.18 installation.
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split

#############################################################################
print(50 * '=')
Expand Down Expand Up @@ -382,5 +388,11 @@ def _calc_score(self, X_train, y_train, X_test, y_test, indices):
# plt.savefig('./random_forest.png', dpi=300)
plt.show()

X_selected = forest.transform(X_train, threshold=0.15)
# sklearn 0.18 removed the `threshold` argument from estimator.transform();
# model-based feature selection moved to the SelectFromModel meta-transformer
# (prefit=True reuses the already-fitted forest).
if Version(sklearn_version) < '0.18':
    X_selected = forest.transform(X_train, threshold=0.15)
else:
    from sklearn.feature_selection import SelectFromModel
    sfm = SelectFromModel(forest, threshold=0.15, prefit=True)
    X_selected = sfm.transform(X_train)

X_selected.shape
12 changes: 11 additions & 1 deletion code/optional-py-scripts/ch05.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
Expand All @@ -27,6 +26,17 @@
from scipy.linalg import eigh
from matplotlib.ticker import FormatStrFormatter

# for sklearn 0.18's alternative syntax
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
if Version(sklearn_version) < '0.18':
    # BUG FIX: before 0.18, train_test_split lived in
    # sklearn.cross_validation, NOT sklearn.grid_search — the original
    # import raised ImportError on every pre-0.18 installation.
    from sklearn.cross_validation import train_test_split
    from sklearn.lda import LDA
else:
    from sklearn.model_selection import train_test_split
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

#############################################################################
print(50 * '=')
print('Section: Unsupervised dimensionality reduction'
Expand Down
83 changes: 55 additions & 28 deletions code/optional-py-scripts/ch06.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,10 @@
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.learning_curve import learning_curve
from sklearn.learning_curve import validation_curve
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
Expand All @@ -38,6 +32,24 @@
from sklearn.metrics import accuracy_score
from scipy import interp

# for sklearn 0.18's alternative syntax
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
if Version(sklearn_version) < '0.18':
    # BUG FIX: before 0.18, train_test_split lived in
    # sklearn.cross_validation, NOT sklearn.grid_search — the original
    # import raised ImportError on every pre-0.18 installation.
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.cross_validation import cross_val_score
    from sklearn.learning_curve import learning_curve
    from sklearn.learning_curve import validation_curve
    from sklearn.grid_search import GridSearchCV
else:
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import learning_curve
    from sklearn.model_selection import validation_curve
    from sklearn.model_selection import GridSearchCV

#############################################################################
print(50 * '=')
print('Section: Loading the Breast Cancer Wisconsin dataset')
Expand Down Expand Up @@ -83,31 +95,39 @@
print('Section: K-fold cross-validation')
print(50 * '-')

kfold = StratifiedKFold(y=y_train,
n_folds=10,
random_state=1)
# sklearn 0.18 turned StratifiedKFold into a splitter object: the labels
# move from the constructor into .split(), and n_folds was renamed n_splits.
# Calling .split() here yields (train_idx, test_idx) pairs, matching what
# iterating the pre-0.18 object produced.
if Version(sklearn_version) < '0.18':
    kfold = StratifiedKFold(y=y_train,
                            n_folds=10,
                            random_state=1)
else:
    kfold = StratifiedKFold(n_splits=10,
                            random_state=1).split(X_train, y_train)

scores = []
for k, (train, test) in enumerate(kfold):
pipe_lr.fit(X_train[train], y_train[train])
score = pipe_lr.score(X_train[test], y_train[test])
scores.append(score)
print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,
print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k + 1,
np.bincount(y_train[train]), score))

print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

print('Using StratifiedKFold')
kfold = StratifiedKFold(y=y_train,
n_folds=10,
random_state=1)
# Same 0.18 API shift as above: labels go to .split() instead of the
# constructor, and n_folds becomes n_splits.
if Version(sklearn_version) < '0.18':
    kfold = StratifiedKFold(y=y_train,
                            n_folds=10,
                            random_state=1)
else:
    kfold = StratifiedKFold(n_splits=10,
                            random_state=1).split(X_train, y_train)

scores = []
for k, (train, test) in enumerate(kfold):
pipe_lr.fit(X_train[train], y_train[train])
score = pipe_lr.score(X_train[test], y_train[test])
scores.append(score)
print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,
print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k + 1,
np.bincount(y_train[train]), score))

print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
Expand All @@ -134,12 +154,12 @@
('clf', LogisticRegression(penalty='l2', random_state=0))])

train_sizes, train_scores, test_scores =\
learning_curve(estimator=pipe_lr,
X=X_train,
y=y_train,
train_sizes=np.linspace(0.1, 1.0, 10),
cv=10,
n_jobs=1)
learning_curve(estimator=pipe_lr,
X=X_train,
y=y_train,
train_sizes=np.linspace(0.1, 1.0, 10),
cv=10,
n_jobs=1)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
Expand Down Expand Up @@ -182,12 +202,12 @@

param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(
estimator=pipe_lr,
X=X_train,
y=y_train,
param_name='clf__C',
param_range=param_range,
cv=10)
estimator=pipe_lr,
X=X_train,
y=y_train,
param_name='clf__C',
param_range=param_range,
cv=10)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
Expand Down Expand Up @@ -345,7 +365,14 @@

X_train2 = X_train[:, [4, 14]]

cv = StratifiedKFold(y_train, n_folds=3, random_state=1)
# sklearn 0.18: StratifiedKFold yields fold indices via .split(); wrap in
# list() so the folds can be iterated more than once (a generator would be
# exhausted after the first pass over the ROC loop).
if Version(sklearn_version) < '0.18':
    cv = StratifiedKFold(y_train,
                         n_folds=3,
                         random_state=1)

else:
    cv = list(StratifiedKFold(n_splits=3,
                              random_state=1).split(X_train, y_train))

fig = plt.figure(figsize=(7, 5))

Expand All @@ -367,7 +394,7 @@
tpr,
lw=1,
label='ROC fold %d (area = %0.2f)'
% (i+1, roc_auc))
% (i + 1, roc_auc))

plt.plot([0, 1],
[0, 1],
Expand Down
54 changes: 36 additions & 18 deletions code/optional-py-scripts/ch07.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,22 +23,30 @@
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from itertools import product

# Added version check for recent scikit-learn 0.18 checks
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score
    # BUG FIX: before 0.18, GridSearchCV lived in sklearn.grid_search,
    # NOT sklearn.cross_validation — the original import raised
    # ImportError on every pre-0.18 installation.
    from sklearn.grid_search import GridSearchCV
else:
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import GridSearchCV

#############################################################################
print(50 * '=')
Expand All @@ -48,7 +56,7 @@

def ensemble_error(n_classifier, error):
k_start = math.ceil(n_classifier / 2.0)
probs = [comb(n_classifier, k) * error**k * (1-error)**(n_classifier - k)
probs = [comb(n_classifier, k) * error**k * (1 - error)**(n_classifier - k)
for k in range(k_start, n_classifier + 1)]
return sum(probs)

Expand Down Expand Up @@ -185,11 +193,11 @@ def predict(self, X):
for clf in self.classifiers_]).T

maj_vote = np.apply_along_axis(
lambda x:
np.argmax(np.bincount(x,
weights=self.weights)),
axis=1,
arr=predictions)
lambda x:
np.argmax(np.bincount(x,
weights=self.weights)),
axis=1,
arr=predictions)
maj_vote = self.lablenc_.inverse_transform(maj_vote)
return maj_vote

Expand Down Expand Up @@ -237,9 +245,9 @@ def get_params(self, deep=True):
y = le.fit_transform(y)

X_train, X_test, y_train, y_test =\
train_test_split(X, y,
test_size=0.5,
random_state=1)
train_test_split(X, y,
test_size=0.5,
random_state=1)

clf1 = LogisticRegression(penalty='l2',
C=0.001,
Expand Down Expand Up @@ -391,9 +399,19 @@ def get_params(self, deep=True):
scoring='roc_auc')
grid.fit(X_train, y_train)

for params, mean_score, scores in grid.grid_scores_:
print("%0.3f+/-%0.2f %r"
% (mean_score, scores.std() / 2.0, params))
# sklearn 0.18 replaced GridSearchCV.grid_scores_ (list of
# (params, mean, fold-scores) tuples) with the cv_results_ dict of
# parallel arrays; both branches print "mean +/- std/2 params" per setting.
if Version(sklearn_version) < '0.18':
    for params, mean_score, scores in grid.grid_scores_:
        print("%0.3f +/- %0.2f %r"
              % (mean_score, scores.std() / 2.0, params))

else:
    cv_keys = ('mean_test_score', 'std_test_score', 'params')

    for r, _ in enumerate(grid.cv_results_['mean_test_score']):
        print("%0.3f +/- %0.2f %r"
              % (grid.cv_results_[cv_keys[0]][r],
                 grid.cv_results_[cv_keys[1]][r] / 2.0,
                 grid.cv_results_[cv_keys[2]][r]))

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)
Expand Down Expand Up @@ -426,9 +444,9 @@ def get_params(self, deep=True):
y = le.fit_transform(y)

X_train, X_test, y_train, y_test =\
train_test_split(X, y,
test_size=0.40,
random_state=1)
train_test_split(X, y,
test_size=0.40,
random_state=1)

tree = DecisionTreeClassifier(criterion='entropy',
max_depth=None,
Expand Down
Loading

0 comments on commit f779b3d

Please sign in to comment.