deploy: 094ff6e

mobook · Nov 20, 2023 · b363dc8 · b363dc8
1 parent eef1aee
commit b363dc8
Show file tree

Hide file tree

Showing 3 changed files with 60 additions and 356 deletions.
diff --git a/_sources/notebooks/05/svm.ipynb b/_sources/notebooks/05/svm.ipynb
@@ -32,9 +32,9 @@
     "import sys, os\n",
     "\n",
     "if 'google.colab' in sys.modules:\n",
-    "    !pip install idaes-pse --pre >/dev/null 2>/dev/null\n",
-    "    !pip install highspy >/dev/null 2>/dev/null\n",
-    "    !idaes get-extensions --to ./bin \n",
+    "    %pip install idaes-pse --pre >/dev/null 2>/dev/null\n",
+    "    %pip install highspy >/dev/null 2>/dev/null\n",
+    "    !idaes get-extensions --to ./bin \n",
     "    os.environ['PATH'] += ':bin'\n",
     "    solver_NLO = \"ipopt\"\n",
     "else:\n",
@@ -82,9 +82,7 @@
    "source": [
     "## The data set \n",
     "\n",
-    "The following data set contains measurements from a collection of known genuine and known counterfeit banknote specimens. The data includes four continuous statistical measures obtained from the wavelet transform of banknote images named \"variance\", \"skewness\", \"curtosis\", and \"entropy\", and a binary variable named \"class\" which is 0 if genuine and 1 if counterfeit.\n",
-    "\n",
-    "https://archive.ics.uci.edu/ml/datasets/banknote+authentication"
+    "The following data set contains measurements from a collection of known genuine and known counterfeit banknote specimens. The data is taken from https://archive.ics.uci.edu/ml/datasets/banknote+authentication and includes four continuous statistical measures obtained from the wavelet transform of banknote images named \"variance\", \"skewness\", \"curtosis\", and \"entropy\", and a binary variable named \"class\" which is 0 if genuine and 1 if counterfeit."
    ]
   },
   {
@@ -194,18 +192,22 @@
     "import pandas as pd\n",
     "from sklearn.model_selection import train_test_split\n",
     "\n",
-    "# read data set\n",
     "df = pd.read_csv(\n",
     "    \"https://raw.githubusercontent.com/mobook/MO-book/main/datasets/data_banknote_authentication.txt\",\n",
     "    header=None,\n",
     ")\n",
     "df.columns = [\"variance\", \"skewness\", \"curtosis\", \"entropy\", \"class\"]\n",
     "df.name = \"Banknotes\"\n",
-    "\n",
-    "# show a few rows\n",
     "df.head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Using built-in `pandas` functionalities, we can get a quick overview of the data set."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 4,
@@ -327,7 +329,6 @@
     }
    ],
    "source": [
-    "# get a statistical description of the data set\n",
     "df.describe()"
    ]
   },
@@ -379,25 +380,6 @@
    "outputs": [],
    "source": [
     "def scatter_labeled_data(X, y, labels=[\"+1\", \"-1\"], colors=[\"g\", \"r\"], **kwargs):\n",
-    "    \"\"\"\n",
-    "    Creates a scatter plot for labeled data with default labels and colors.\n",
-    "\n",
-    "    Parameters:\n",
-    "    X : DataFrame\n",
-    "        Feature matrix as a DataFrame.\n",
-    "    y : Series\n",
-    "        Target vector as a Series.\n",
-    "    labels : list, optional\n",
-    "        Labels for the positive and negative classes. Default is [\"+1\", \"-1\"].\n",
-    "    colors : list, optional\n",
-    "        Colors for the positive and negative classes. Default is [\"g\", \"r\"].\n",
-    "    **kwargs : dict\n",
-    "        Additional keyword arguments for the scatter plot.\n",
-    "\n",
-    "    Returns:\n",
-    "    None\n",
-    "    \"\"\"\n",
-    "\n",
     "    # Prepend keyword arguments for all scatter plots\n",
     "    kw = {\"x\": 0, \"y\": 1, \"kind\": \"scatter\", \"alpha\": 0.4}\n",
     "    kw.update(kwargs)\n",
@@ -482,33 +464,17 @@
    "source": [
     "# Linear Support Vector Machine (SVM) class\n",
     "class LinearSVM:\n",
-    "    # Initialize the Linear SVM with weights and bias\n",
+    "    # Initialize the Linear SVM with weights w and bias b\n",
     "    def __init__(self, w, b):\n",
-    "        \"\"\"\n",
-    "        Args:\n",
-    "            w (Pandas Series or dictionary): Weights of the SVM\n",
-    "            b (float): Bias of the SVM\n",
-    "        \"\"\"\n",
     "        self.w = pd.Series(w)\n",
     "        self.b = float(b)\n",
     "\n",
-    "    # Call method to compute the decision function\n",
+    "    # Call method to compute the decision function using the input data X\n",
     "    def __call__(self, X):\n",
-    "        \"\"\"\n",
-    "        Args:\n",
-    "            X (pandas.DataFrame): Input data\n",
-    "\n",
-    "        Returns:\n",
-    "            numpy.array: Array of decision function values\n",
-    "        \"\"\"\n",
     "        return np.sign(X.dot(self.w) + self.b)\n",
     "\n",
-    "    # Representation method for the Linear SVM class\n",
+    "    # String representation method for the Linear SVM class\n",
     "    def __repr__(self):\n",
-    "        \"\"\"\n",
-    "        Returns:\n",
-    "            str: String representation of the Linear SVM\n",
-    "        \"\"\"\n",
     "        return f\"LinearSvm(w = {self.w.to_dict()}, b = {self.b})\""
    ]
   },
@@ -569,21 +535,6 @@
    "outputs": [],
    "source": [
     "def scatter_comparison(X, y, y_pred):\n",
-    "    \"\"\"\n",
-    "    Creates scatter plots comparing actual and predicted outcomes for both training and test sets.\n",
-    "\n",
-    "    Parameters:\n",
-    "    X : DataFrame\n",
-    "        Feature matrix as a DataFrame.\n",
-    "    y : Series\n",
-    "        Actual target vector as a Series.\n",
-    "    y_pred : Series\n",
-    "        Predicted target vector as a Series.\n",
-    "\n",
-    "    Returns:\n",
-    "    None\n",
-    "    \"\"\"\n",
-    "\n",
     "    xmin, ymin = X.min()\n",
     "    xmax, ymax = X.max()\n",
     "    xlim = [xmin - 0.05 * (xmax - xmin), xmax + 0.05 * (xmax - xmin)]\n",
@@ -674,7 +625,7 @@
     "\n",
     "The accuracy score alone is not always a reliable metric for evaluating the performance of binary classifiers. For instance, when one outcome is significantly more frequent than the other, a classifier that always predicts the more common outcome without regard to the feature vector can achieve a high accuracy score. Moreover, in many applications, the consequences of a false positive can differ from those of a false negative. For these reasons, we seek a more comprehensive set of metrics to compare binary classifiers. A [detailed discussion on this topic](https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-019-6413-7) recommends the [Matthews correlation coefficient (MCC)](https://towardsdatascience.com/the-best-classification-metric-youve-never-heard-of-the-matthews-correlation-coefficient-3bf50a2f3e9a) as a reliable performance measure for binary classifiers.\n",
     "\n",
-    "The code below demonstrates an example of a function that evaluates the performance of a binary classifier and returns the Matthews correlation coefficient as its output."
+    "The code below demonstrates an example of a function that evaluates the performance of a binary classifier and returns the Matthews correlation coefficient as its output. The function `validate` calculates and displays the sensitivity, precision, and Matthews correlation coefficient (MCC) for a binary classifier based on its true labels (`y_true`) and predicted labels (`y_pred`)."
    ]
   },
   {
@@ -685,20 +636,6 @@
    "outputs": [],
    "source": [
     "def validate(y_true, y_pred, verbose=True):\n",
-    "    \"\"\"\n",
-    "    This function calculates and displays the sensitivity, precision, and Matthews correlation coefficient\n",
-    "    (MCC) for a binary classifier based on its true labels (y_true) and predicted labels (y_pred).\n",
-    "\n",
-    "    Args:\n",
-    "    y_true (array-like): A list or array containing the true labels of the samples.\n",
-    "    y_pred (array-like): A list or array containing the predicted labels of the samples.\n",
-    "    verbose (bool, optional): If True, the function prints and displays the calculated metrics and\n",
-    "                              confusion matrix. Defaults to True.\n",
-    "\n",
-    "    Returns:\n",
-    "    float: The calculated Matthews correlation coefficient (MCC).\n",
-    "    \"\"\"\n",
-    "\n",
     "    # Calculate the elements of the confusion matrix\n",
     "    true_positives = sum((y_true > 0) & (y_pred > 0))\n",
     "    false_negatives = sum((y_true > 0) & (y_pred < 0))\n",
@@ -1027,23 +964,7 @@
    ],
    "source": [
     "def svm_factory_lp(X, y, lambd=1):\n",
-    "    \"\"\"\n",
-    "    Creates a linear support vector machine (SVM) model using linear programming.\n",
-    "\n",
-    "    Parameters:\n",
-    "    X : DataFrame\n",
-    "        Feature matrix as a DataFrame.\n",
-    "    y : Series\n",
-    "        Target vector as a Series.\n",
-    "    lambd : float, optional\n",
-    "        Regularization parameter. Default is 1.\n",
-    "\n",
-    "    Returns:\n",
-    "    LinearSvm :\n",
-    "        A trained linear SVM model.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    m = pyo.ConcreteModel()\n",
+    "    m = pyo.ConcreteModel(\"Linear SVM\")\n",
     "\n",
     "    # Use dataframe columns and index to index variables and constraints\n",
     "    m.P = pyo.Set(initialize=X.columns)\n",
@@ -1123,11 +1044,13 @@
     "\\text{s.t.} \\quad &  z_i \\geq 1 - y_i(w^\\top x_i + b) & \\forall i = 1, \\dots, n \\\\\n",
     "& z_i\\geq 0 & \\forall i = 1, \\dots, n \\\\\n",
     "& w\\in\\mathbb{R}^p \\\\\n",
-    "& b\\in\\mathbb{R} \\\\\n",
+    "& b\\in\\mathbb{R}\n",
     "\\end{align*}\n",
     "$$\n",
     "\n",
-    "where $\\frac{1}{2} \\|\\bar{w}\\|_2^2$ is included to regularize the solution for $w$. Choosing larger values of $c$ will reduce the number and size of misclassifications. The trade-off will be larger weights $w$ and the accompanying risk of over over-fitting the training data. "
+    "where $\\frac{1}{2} \\|\\bar{w}\\|_2^2$ is included to regularize the solution for $w$. Choosing larger values of $c$ will reduce the number and size of misclassifications. The trade-off will be larger weights $w$ and the accompanying risk of over-fitting the training data. \n",
+    "\n",
+    "The following cell creates a support vector machine (SVM) model using quadratic programming starting from data."
    ]
   },
   {
@@ -1220,23 +1143,7 @@
    ],
    "source": [
     "def svm_factory_qp(X, y, c=1):\n",
-    "    \"\"\"\n",
-    "    Creates a linear support vector machine (SVM) model using quadratic programming.\n",
-    "\n",
-    "    Parameters:\n",
-    "    X : DataFrame\n",
-    "        Feature matrix as a DataFrame.\n",
-    "    y : Series\n",
-    "        Target vector as a Series.\n",
-    "    c : float, optional\n",
-    "        Regularization parameter. Default is 1.\n",
-    "\n",
-    "    Returns:\n",
-    "    LinearSvm :\n",
-    "        A trained linear SVM model.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    m = pyo.ConcreteModel()\n",
+    "    m = pyo.ConcreteModel(\"SVM QP\")\n",
     "\n",
     "    # Use dataframe columns and index to index variables and constraints\n",
     "    m.P = pyo.Set(initialize=X.columns)\n",
@@ -1299,8 +1206,8 @@
     "\n",
     "$$\n",
     "\\begin{align*}\n",
-    "\\min_{\\alpha_i}\\ & \\frac{1}{2} \\sum_{i=1}^n\\sum_{j=1}^n \\alpha_i \\alpha_j y_i y_j ( x_i^\\top x_j ) -  \\sum_{i=1}^n \\alpha_i \\\\\n",
-    "\\text{s. t.}\\quad & \\sum_{i=1}^n \\alpha_i y_i = 0  \\\\\n",
+    "\\min \\quad  & \\frac{1}{2} \\sum_{i=1}^n\\sum_{j=1}^n \\alpha_i \\alpha_j y_i y_j ( x_i^\\top x_j ) -  \\sum_{i=1}^n \\alpha_i \\\\\n",
+    "\\text{s.t.}\\quad & \\sum_{i=1}^n \\alpha_i y_i = 0  \\\\\n",
     "& \\alpha_i \\in \\left[0, \\frac{c}{n}\\right] & i = 1, \\dots, n \\\\\n",
     "\\end{align*}\n",
     "$$\n",
@@ -1329,10 +1236,12 @@
     "\n",
     "$$\n",
     "\\begin{align*}\n",
-    "\\min_{\\alpha_i}\\ & \\frac{1}{2} w^\\top w -  1^\\top\\alpha \\\\\n",
-    "\\text{s. t.}\\quad & y^\\top\\alpha = 0 \\\\\n",
-    "& w = F^\\top\\alpha & w\\in\\mathbb{R}^p \\\\\n",
-    "& 0 \\leq \\alpha_i \\leq \\frac{c}{n} & \\alpha\\in\\mathbb{R}^n \\\\\n",
+    "\\min \\quad & \\frac{1}{2} w^\\top w -  1^\\top\\alpha \\\\\n",
+    "\\text{s.t.}\\quad & y^\\top\\alpha = 0 \\\\\n",
+    "& w = F^\\top\\alpha \\\\\n",
+    "& 0 \\leq \\alpha_i \\leq \\frac{c}{n} & i = 1, \\dots, n\\\\\n",
+    "& \\alpha\\in\\mathbb{R}^n\\\\\n",
+    "& w\\in\\mathbb{R}^p.\n",
     "\\end{align*}\n",
     "$$\n",
     "\n",
@@ -1437,24 +1346,7 @@
    ],
    "source": [
     "def svm_factory_dual(X, y, c=1):\n",
-    "    \"\"\"\n",
-    "    Creates a linear support vector machine (SVM) model using the dual formulation\n",
-    "    and quadratic programming.\n",
-    "\n",
-    "    Parameters:\n",
-    "    X : DataFrame\n",
-    "        Feature matrix as a DataFrame.\n",
-    "    y : Series\n",
-    "        Target vector as a Series.\n",
-    "    c : float, optional\n",
-    "        Regularization parameter. Default is 1.\n",
-    "\n",
-    "    Returns:\n",
-    "    LinearSvm :\n",
-    "        A trained linear SVM model.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    m = pyo.ConcreteModel()\n",
+    "    m = pyo.ConcreteModel(\"Linear SVM Dual\")\n",
     "\n",
     "    # Use dataframe columns and index to index variables and constraints\n",
     "    m.P = pyo.Set(initialize=X.columns)\n",
@@ -1644,32 +1536,15 @@
    "outputs": [],
    "source": [
     "class KernelSVM:\n",
-    "    \"\"\"\n",
-    "    Kernel Support Vector Machine (SVM) class.\n",
-    "    \"\"\"\n",
-    "\n",
+    "    # Initialize the Kernel SVM with weights and bias\n",
     "    def __init__(self, X, y, a, b, kernel):\n",
-    "        \"\"\"\n",
-    "        Initialize the Kernel SVM with weights and bias.\n",
-    "\n",
-    "        :param X: numpy array or list, training data.\n",
-    "        :param y: numpy array or list, target labels.\n",
-    "        :param a: numpy array or list, alpha values for the support vectors.\n",
-    "        :param b: float, bias value.\n",
-    "        :param kernel: function, kernel function to be used in the SVM.\n",
-    "        \"\"\"\n",
     "        self.X = np.array(X)\n",
     "        self.u = np.multiply(np.array(a), np.array(y))\n",
     "        self.b = b\n",
     "        self.kernel = kernel\n",
     "\n",
+    "    # Call method to compute the decision function using the input dataframe Z\n",
     "    def __call__(self, Z):\n",
-    "        \"\"\"\n",
-    "        Compute the decision function.\n",
-    "\n",
-    "        :param Z: pandas DataFrame, test data.\n",
-    "        :return: pandas Series, predicted labels.\n",
-    "        \"\"\"\n",
     "        K = [\n",
     "            [self.kernel(self.X[i, :], Z.loc[j, :]) for j in Z.index]\n",
     "            for i in range(len(self.X))\n",
@@ -1683,7 +1558,7 @@
    "id": "83eebe9d-9561-4a46-8a35-c4028c3a7739",
    "metadata": {},
    "source": [
-    "The second part of the implementation is a factory function containing the optimization model for training an SVM. Given training data and a kernal function, the factory returns an instance of a kernelized SVM. The default is a linear kernel."
+    "The second part of the implementation is a factory function containing the optimization model for training an SVM. Given training data and a kernel function, the factory returns an instance of a kernelized SVM. The default for the kernel function is a linear kernel. An additional optional argument is the scalar `tol`, which is the tolerance for the eigenvalue threshold."
    ]
   },
   {
@@ -1694,26 +1569,6 @@
    "outputs": [],
    "source": [
     "def svm_factory_kernel(X, y, c=1, tol=1e-8, kernel=lambda x, z: x @ z):\n",
-    "    \"\"\"\n",
-    "    Creates a kernel-based support vector machine (SVM) model.\n",
-    "\n",
-    "    Parameters:\n",
-    "    X : DataFrame\n",
-    "        Feature matrix as a DataFrame.\n",
-    "    y : Series\n",
-    "        Target vector as a Series.\n",
-    "    c : float, optional\n",
-    "        Regularization parameter. Default is 1.\n",
-    "    tol : float, optional\n",
-    "        Tolerance for eigenvalue threshold. Default is 1e-8.\n",
-    "    kernel : callable, optional\n",
-    "        Kernel function that accepts two input vectors and returns a scalar. Default is the linear kernel.\n",
-    "\n",
-    "    Returns:\n",
-    "    kernelSVM : callable\n",
-    "        A trained kernel-based SVM model as a callable function.\n",
-    "    \"\"\"\n",
-    "\n",
     "    # Convert to numpy arrays for speed improvement\n",
     "    n, p = X.shape\n",
     "    X_ = X.to_numpy()\n",