diff --git a/Chapter2-DataManipulation/2.10_dimensionality_reduction.html b/Chapter2-DataManipulation/2.10_dimensionality_reduction.html
index 8a740a1..56fe74d 100644
--- a/Chapter2-DataManipulation/2.10_dimensionality_reduction.html
+++ b/Chapter2-DataManipulation/2.10_dimensionality_reduction.html
@@ -908,7 +908,7 @@
-<matplotlib.legend.Legend at 0x7f5a372da3e0>
+<matplotlib.legend.Legend at 0x7f60f04a8fa0>
@@ -989,8 +989,16 @@ Step one: subtract the mean
# Remove the mean of the data
Xavg = np.mean(X, axis=1) # Compute mean
B = X - np.tile(Xavg,(nPoints,1)).T # Mean-subtracted data
+
+plt.scatter(B[0,:],B[1,:], color='k', alpha=0.125)
+
+
+<matplotlib.collections.PathCollection at 0x7f60ecda71c0>
-[[ 0.42469643 0.747179 0.85328813 ... 0.88246352 -1.884191
- -1.32403592]
- [-0.18730514 0.80047482 1.11341706 ... -0.09048239 -2.95497644
- -0.18146037]]
-[[ 0.42469643 -0.18730514]
- [ 0.747179 0.80047482]
- [ 0.85328813 1.11341706]
- ...
- [ 0.88246352 -0.09048239]
- [-1.884191 -2.95497644]
- [-1.32403592 -0.18146037]]
-
-<matplotlib.legend.Legend at 0x7f1890190820>
+<matplotlib.legend.Legend at 0x7f60ec163730>
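The tile-based centering above can also be written with NumPy broadcasting; a minimal sketch, with a hypothetical random X standing in for the notebook's (2, nPoints) array:

import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((2, 1000))        # hypothetical stand-in for the notebook data
Xavg = np.mean(X, axis=1, keepdims=True)  # per-feature mean, shape (2, 1)
B = X - Xavg                              # broadcasting replaces np.tile(Xavg,(nPoints,1)).T
print(np.allclose(B.mean(axis=1), 0.0))   # True: each feature is now zero-mean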
@@ -1304,7 +1297,7 @@ 2.10.3 PCA on 3D data.
-
SVD can be computationally intensive for larger dimensions.
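Two cheaper options are worth sketching: the economy (thin) SVD, and a randomized truncated SVD when only the leading components are needed. A minimal sketch, with a hypothetical tall matrix A that is not from the notebook:

import numpy as np
from sklearn.utils.extmath import randomized_svd

A = np.random.default_rng(0).standard_normal((5000, 300))  # hypothetical tall matrix

# Thin SVD avoids forming the full 5000 x 5000 U matrix.
U, S, Vt = np.linalg.svd(A, full_matrices=False)

# Randomized SVD approximates only the leading components,
# much cheaper when the target rank is small.
Uk, Sk, Vtk = randomized_svd(A, n_components=10, random_state=0)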
diff --git a/Chapter3-MachineLearning/3.3_binary_classification.html b/Chapter3-MachineLearning/3.3_binary_classification.html
index 7092d9b..7070101 100644
--- a/Chapter3-MachineLearning/3.3_binary_classification.html
+++ b/Chapter3-MachineLearning/3.3_binary_classification.html
@@ -747,16 +747,10 @@
----------------------------------------------------------------------------
-ImportError Traceback (most recent call last)
-Cell In [7], line 1
-----> 1 from sklearn.inspection import DecisionBoundaryDisplay
- 2 ax = plt.subplot()
- 3 # plot the decision boundary as a background
-
-ImportError: cannot import name 'DecisionBoundaryDisplay' from 'sklearn.inspection' (/Users/marinedenolle/opt/miniconda3/envs/mlgeo_sk/lib/python3.9/site-packages/sklearn/inspection/__init__.py)
+<matplotlib.collections.PathCollection at 0x7f7c5ef8fdf0>
+
The results show a not-too-bad classification, but with low confidence.
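The removed traceback reflects that DecisionBoundaryDisplay only exists in scikit-learn 1.1 and later; a minimal guarded import, assuming the notebook should still run on older environments:

try:
    from sklearn.inspection import DecisionBoundaryDisplay
except ImportError:  # scikit-learn < 1.1
    DecisionBoundaryDisplay = None  # fall back to a plain scatter plot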
@@ -779,11 +773,6 @@ The mean accuracy on the given test and labels is 0.975000
-<matplotlib.collections.PathCollection at 0x151ada520>
+<matplotlib.collections.PathCollection at 0x7f7c5ad06230>
Now we will test the effect of data normalization before the classification. We will stretch the first axis of the data to see the effects.
-# make a data set
-X, y = make_moons(noise=0.3, random_state=0)
-X[:,0] = 10*X[:,0]
-
# define ML
-K = 5
-clf= KNeighborsClassifier(K)
-
-# normalize data.
-# X = StandardScaler().fit_transform(X)
-
-# split data between train and test set.
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
-
-# Fit the model.
-clf.fit(X_train, y_train)
-
-# calculate the mean accuracy on the given test data and labels.
-score = clf.score(X_test, y_test)
-print("The mean accuracy on the given test and labels is %f" %score)
-
-# plot the decision boundary as a background
+# plot the decision boundary as a background
ax = plt.subplot()
-# DecisionBoundaryDisplay.from_estimator(clf, X, cmap='PiYG', alpha=0.8, ax=ax, eps=0.5)
+DecisionBoundaryDisplay.from_estimator(clf, X, cmap='PiYG', alpha=0.8, ax=ax, eps=0.5)
ax.scatter(X[:, 0], X[:, 1], c=y, cmap='PiYG', alpha=0.6, edgecolors="k")
The mean accuracy on the given test and labels is 0.775000
-<matplotlib.collections.PathCollection at 0x151ada670>
+<matplotlib.collections.PathCollection at 0x7f7c5ab6c760>
+
+
+
+Now we will test to see what happens when you do not normalize your data before the classification. We will stretch the first axis of the data to see the effects.
+
+
+# make a data set
+X, y = make_moons(noise=0.3, random_state=0)
+X[:,0] = 10*X[:,0]
-
-This drastically reduces the performance.
# define ML
K = 5
clf= KNeighborsClassifier(K)
-# normalize data.
-X = StandardScaler().fit_transform(X)
-
# split data between train and test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
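For the normalized variant, note that StandardScaler().fit_transform(X) on the full data (as in the removed lines above) uses test-set statistics during fitting; a minimal leakage-free sketch with the scaler inside a pipeline, assuming the same stretched make_moons data:

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_moons(noise=0.3, random_state=0)
X[:, 0] = 10 * X[:, 0]                      # stretch the first axis

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

# Fitting the scaler only on the training split avoids data leakage.
clf = make_pipeline(StandardScaler(), KNeighborsClassifier(5))
clf.fit(X_train, y_train)
print("The mean accuracy on the given test and labels is %f" % clf.score(X_test, y_test))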
@@ -871,19 +836,20 @@ 1.1 Synthetic Data
-The mean accuracy on the given test and labels is 0.975000
+The mean accuracy on the given test and labels is 0.775000
-<matplotlib.collections.PathCollection at 0x151aa9100>
+<matplotlib.collections.PathCollection at 0x7f7c5ace6590>
-
+
+This drastically reduces the performance.
2. Classifier Performance Metrics
-In a binary classifier, we label one of the two classes as positive, the other class is negative. Let’s consider N data samples.
+In a binary classifier, we label one of the two classes as positive, the other class as negative. Let’s consider N data samples.
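A minimal sketch of this positive/negative bookkeeping with hypothetical labels, using scikit-learn's confusion matrix (raveled as TN, FP, FN, TP for binary labels):

from sklearn.metrics import confusion_matrix

y_true = [1, 0, 1, 1, 0, 1, 0, 0]   # hypothetical ground truth, N = 8
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]   # hypothetical predictions
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print("accuracy = (TP + TN) / N =", (tp + tn) / len(y_true))   # 0.75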