From c0fd2f8e33ad3328484701b7a4b747e89990261c Mon Sep 17 00:00:00 2001 From: "Madson Luiz Dantas Dias (UFC)" Date: Mon, 18 Mar 2024 14:10:53 -0300 Subject: [PATCH] fix: correction of the cosine distance calculation method (#78). --- .pre-commit-config.yaml | 9 ------ fcmeans/main.py | 61 ++++++++++++++++++++++++++++++----------- requirements.txt | 20 ++++++++------ 3 files changed, 56 insertions(+), 34 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8342d0b..f699b44 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,15 +16,6 @@ repos: - id: trailing-whitespace - id: mixed-line-ending - id: check-added-large-files - -- repo: https://github.com/psf/black - rev: 22.1.0 - hooks: - - id: black - args: ['--line-length=79'] - files: '(\.pyi?|wscript|ipynb)$' - language_version: python3 - additional_dependencies: [black-nb] - repo: https://github.com/asottile/blacken-docs rev: v1.8.0 hooks: diff --git a/fcmeans/main.py b/fcmeans/main.py index 57d9814..04ffe31 100644 --- a/fcmeans/main.py +++ b/fcmeans/main.py @@ -1,18 +1,20 @@ -from typing import Optional, Dict, Union, Callable from enum import Enum +from typing import Callable, Dict, Optional, Union -from joblib import Parallel, delayed import numpy as np +import tqdm +from joblib import Parallel, delayed from numpy.typing import NDArray from pydantic import BaseModel, ConfigDict, Field, validate_call -import tqdm class DistanceOptions(str, Enum): + """Implemented distances""" euclidean = 'euclidean' minkowski = 'minkowski' cosine = 'cosine' + class FCM(BaseModel): r"""Fuzzy C-means Model @@ -49,7 +51,9 @@ class FCM(BaseModel): trained: bool = False n_jobs: int = Field(1, ge=1) verbose: Optional[bool] = False - distance: Optional[Union[DistanceOptions, Callable]] = DistanceOptions.euclidean + distance: Optional[Union[DistanceOptions, Callable]] = ( + DistanceOptions.euclidean + ) distance_params: Optional[Dict] = {} @validate_call(config=dict(arbitrary_types_allowed=True)) @@ -62,7 +66,9 @@ def fit(self, X: NDArray) -> None: self.rng = np.random.default_rng(self.random_state) n_samples = X.shape[0] self.u = self.rng.uniform(size=(n_samples, self.n_clusters)) - self.u = self.u / np.tile(self.u.sum(axis=1)[np.newaxis].T, self.n_clusters) + self.u = self.u / np.tile( + self.u.sum(axis=1)[np.newaxis].T, self.n_clusters + ) for _ in tqdm.tqdm( range(self.max_iter), desc="Training", disable=not self.verbose ): @@ -85,9 +91,16 @@ def soft_predict(self, X: NDArray) -> NDArray: NDArray: Fuzzy partition array, returned as an array with n_samples rows and n_clusters columns. """ - temp = FCM._dist(X, self._centers, self.distance, self.distance_params) ** (2 / (self.m - 1)) + temp = FCM._dist( + X, + self._centers, + self.distance, + self.distance_params + ) ** (2 / (self.m - 1)) u_dist = Parallel(n_jobs=self.n_jobs)( - delayed(lambda data, col: (data[:, col] / data.T).sum(0))(temp, col) + delayed( + lambda data, col: (data[:, col] / data.T).sum(0) + )(temp, col) for col in range(temp.shape[1]) ) u_dist = np.vstack(u_dist).T @@ -119,17 +132,28 @@ def _is_trained(self) -> bool: return False @staticmethod - def _dist(A: NDArray, B: NDArray, distance: str, distance_params: str) -> NDArray: + def _dist( + A: NDArray, + B: NDArray, + distance: Optional[Union[DistanceOptions, Callable]] = ( + DistanceOptions.euclidean + ), + distance_params: Optional[Dict] = {} + ) -> NDArray: """Compute the distance between two matrices""" - if isinstance(distance, Callable): + if callable(distance): return distance(A, B, distance_params) elif distance == 'minkowski': - return FCM._minkowski(A, B, distance_params.get("p", 1.0)) + if isinstance(distance_params, dict): + p = distance_params.get("p", 1.0) + else: + p = 1.0 + return FCM._minkowski(A, B, p) elif distance == 'cosine': - return FCM._cosine_similarity(A, B) + return FCM._cosine(A, B) else: return FCM._euclidean(A, B) - + @staticmethod def _euclidean(A: NDArray, B: NDArray) -> NDArray: """Compute the euclidean distance between two matrices""" @@ -139,13 +163,18 @@ def _euclidean(A: NDArray, B: NDArray) -> NDArray: def _minkowski(A: NDArray, B: NDArray, p: float) -> NDArray: """Compute the minkowski distance between two matrices""" return (np.einsum("ijk->ij", (A[:, None, :] - B) ** p)) ** (1/p) - + @staticmethod def _cosine_similarity(A: NDArray, B: NDArray) -> NDArray: """Compute the cosine similarity between two matrices""" - p1 = np.sqrt(np.sum(A**2,axis=1))[:,np.newaxis] - p2 = np.sqrt(np.sum(B**2,axis=1))[np.newaxis,:] - return np.dot(A,B.T) / (p1*p2) + p1 = np.sqrt(np.sum(A**2, axis=1))[:, np.newaxis] + p2 = np.sqrt(np.sum(B**2, axis=1))[np.newaxis, :] + return np.dot(A, B.T) / (p1*p2) + + @staticmethod + def _cosine(A: NDArray, B: NDArray) -> NDArray: + """Compute the cosine distance between two matrices""" + return np.abs(1 - FCM._cosine_similarity(A, B)) @staticmethod def _next_centers(X: NDArray, u: NDArray, m: float): diff --git a/requirements.txt b/requirements.txt index f5f8215..463785d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,11 @@ -click==8.0.4; python_version >= "3.6" -colorama==0.4.4; python_version >= "3.6" and python_full_version < "3.0.0" and platform_system == "Windows" or platform_system == "Windows" and python_version >= "3.6" and python_full_version >= "3.5.0" -joblib==1.2.0 -numpy==1.22.2; python_version >= "3.8" -pydantic==1.9.0; python_full_version >= "3.6.1" -tabulate==0.8.9 -tqdm==4.64.1 -typer==0.4.0; python_version >= "3.6" -typing-extensions==4.1.1; python_version >= "3.6" and python_full_version >= "3.6.1" +annotated-types==0.6.0 ; python_version >= "3.9" and python_version < "4.0" +click==8.1.7 ; python_version >= "3.9" and python_version < "4.0" +colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0" and platform_system == "Windows" +joblib==1.3.2 ; python_version >= "3.9" and python_version < "4.0" +numpy==1.26.4 ; python_version >= "3.9" and python_version < "4.0" +pydantic-core==2.16.3 ; python_version >= "3.9" and python_version < "4.0" +pydantic==2.6.4 ; python_version >= "3.9" and python_version < "4.0" +tabulate==0.8.10 ; python_version >= "3.9" and python_version < "4.0" +tqdm==4.66.2 ; python_version >= "3.9" and python_version < "4.0" +typer==0.9.0 ; python_version >= "3.9" and python_version < "4.0" +typing-extensions==4.10.0 ; python_version >= "3.9" and python_version < "4.0"