From 14ec48f07aab9f53006f5886050225e7a8cfab98 Mon Sep 17 00:00:00 2001 From: ken1000minus7 Date: Fri, 10 Feb 2023 02:12:57 +0530 Subject: [PATCH 1/2] Added DBSCAN algorithm --- CMakeLists.txt | 5 +- docs/methods/cluster/DBSCAN.md | 52 +++++++++++ examples/cluster/DBSCAN.cpp | 26 ++++++ src/slowmokit.hpp | 1 + src/slowmokit/methods/cluster/DBSCAN.hpp | 12 +++ .../methods/cluster/DBSCAN/DBSCAN.cpp | 88 +++++++++++++++++++ .../methods/cluster/DBSCAN/DBSCAN.hpp | 82 +++++++++++++++++ 7 files changed, 265 insertions(+), 1 deletion(-) create mode 100644 docs/methods/cluster/DBSCAN.md create mode 100644 examples/cluster/DBSCAN.cpp create mode 100644 src/slowmokit/methods/cluster/DBSCAN.hpp create mode 100644 src/slowmokit/methods/cluster/DBSCAN/DBSCAN.cpp create mode 100644 src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b423799..a53e2de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,4 +60,7 @@ add_library(slowmokit src/slowmokit/methods/metrics/recall.hpp src/slowmokit/methods/metrics/recall.cpp src/slowmokit/methods/metrics/mean_squared_error.hpp - src/slowmokit/methods/metrics/mean_squared_error.cpp) + src/slowmokit/methods/metrics/mean_squared_error.cpp + src/slowmokit/methods/cluster/DBSCAN/DBSCAN.cpp + src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp + src/slowmokit/methods/cluster/DBSCAN.hpp) diff --git a/docs/methods/cluster/DBSCAN.md b/docs/methods/cluster/DBSCAN.md new file mode 100644 index 0000000..a01d86d --- /dev/null +++ b/docs/methods/cluster/DBSCAN.md @@ -0,0 +1,52 @@ +# DBSCAN + +DBSCAN stands for **D**ensity **B**ased **S**patial **C**lustering of **A**pplications with **N**oise + +The model clusters the given training set based on density of the given data points i.e. a point belongs to a cluster based on how close it is to its neighbouring points. This model is capable of finding arbitrary shaped clusters and identifying outliers. 
+ +## Parameters + +| Name | Definition | Defaults | Type | +|-------------| ------------------------------------------------------------------------------------------- |----------|---------------| +| `eps` | Measure of how close a point should be to be considered in the vicinity of another point | 0.5 | `long double` | +| `minSamples` | Minimum number of points that should lie in the vicinity of a point for it to be considered a core point | 5 | `int` | + +## Attributes + +| Name | Definition | Shape | +|--------|------------------------------------------------------------------------------------|-----------------------------------| +| `labels` | Labels assigned to each data point of the training set fitted into the model | Number of data points in training set | + +## Methods + +| Name | Definition | Return value | +|--------------------------------------|----------------------------------------|---------------| +| `fit(std::vector<std::vector<T>> x)` | Fits and clusters the given training set | `void` | +| `fitPredict(std::vector<std::vector<T>> x)` | Fits and clusters the given training set and returns the labels assigned to each data point | `std::vector<int>` | +| `getLabels()` | Returns the labels assigned to each data point of the training set fitted into the model | `std::vector<int>` | + +## Example + +```cpp +DBSCAN<double> db(0.6, 4); +std::vector<std::vector<double>> x = { + {1, 2}, + {3, 4}, + {2.5, 4}, + {1.5, 2.5}, + {3, 5}, + {2.8, 4.5}, + {2.5, 4.5}, + {1.2, 2.5}, + {1, 3}, + {1, 5}, + {1, 2.5}, + {5, 6}, + {4, 3} +}; +std::vector<int> labels = db.fitPredict(x); +std::cout << "X Y Cluster\n"; +for(int i = 0; i < x.size(); i++) { + std::cout << x[i][0] << " " << x[i][1] << " " << labels[i] << "\n"; +} +``` diff --git a/examples/cluster/DBSCAN.cpp b/examples/cluster/DBSCAN.cpp new file mode 100644 index 0000000..077f049 --- /dev/null +++ b/examples/cluster/DBSCAN.cpp @@ -0,0 +1,26 @@ +//#include "../../src/slowmokit/methods/cluster/DBSCAN/DBSCAN.cpp" +// +//int main() { +// DBSCAN<double> db(0.6, 4); +// std::vector<std::vector<double>> x = { +// {1, 2}, +// {3, 4}, 
+// {2.5, 4}, +// {1.5, 2.5}, +// {3, 5}, +// {2.8, 4.5}, +// {2.5, 4.5}, +// {1.2, 2.5}, +// {1, 3}, +// {1, 5}, +// {1, 2.5}, +// {5, 6}, +// {4, 3} +// }; +// std::vector<int> labels = db.fitPredict(x); +// std::cout << "X Y Cluster\n"; +// for(int i = 0; i < x.size(); i++) { +// std::cout << x[i][0] << " " << x[i][1] << " " << labels[i] << "\n"; +// } +// return 0; +//} \ No newline at end of file diff --git a/src/slowmokit.hpp b/src/slowmokit.hpp index 6b8ac82..8fdc669 100644 --- a/src/slowmokit.hpp +++ b/src/slowmokit.hpp @@ -20,5 +20,6 @@ #include "slowmokit/methods/neighbors/bernoulli_nb.hpp" #include "slowmokit/methods/neighbors/gaussian_nb.hpp" #include "slowmokit/methods/neighbors/knn.hpp" +#include "slowmokit/methods/cluster/DBSCAN.hpp" #endif // SLOWMOKIT_HPP diff --git a/src/slowmokit/methods/cluster/DBSCAN.hpp b/src/slowmokit/methods/cluster/DBSCAN.hpp new file mode 100644 index 0000000..cd13b5f --- /dev/null +++ b/src/slowmokit/methods/cluster/DBSCAN.hpp @@ -0,0 +1,12 @@ +/** + * @file methods/cluster/DBSCAN.hpp + * + * Easy include for DBSCAN algorithm + */ + +#ifndef SLOWMOKIT_CLUSTER_DBSCAN_HPP +#define SLOWMOKIT_CLUSTER_DBSCAN_HPP + +#include "DBSCAN/DBSCAN.hpp" + +#endif //SLOWMOKIT_CLUSTER_DBSCAN_HPP diff --git a/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.cpp b/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.cpp new file mode 100644 index 0000000..c6f68b9 --- /dev/null +++ b/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.cpp @@ -0,0 +1,88 @@ +/** + * @file methods/cluster/DBSCAN/DBSCAN.cpp + * + * Implementation of the DBSCAN class + */ + +#include "DBSCAN.hpp" + +template<class T> +DBSCAN<T>::DBSCAN(long double eps, int minSamples) { + if(eps < 0 || minSamples < 0) { + throw std::invalid_argument("Values can't be negative"); + } + this->eps = eps; + this->minSamples = minSamples; +} + +template<class T> +long double DBSCAN<T>::euclideanDistance(const std::vector<T> &p1, const std::vector<T> &p2) { + long double distance = 0.0; + if(p1.size() != p2.size()) { + throw std::invalid_argument("Feature vectors are unequal in 
size"); + } + int n = p1.size(); + for(int i = 0; i < n; i++) { + distance += (long double) (p1[i] - p2[i]) * (p1[i] - p2[i]); + } + return sqrtl(distance); +} + +template<class T> +void DBSCAN<T>::cluster(int i, std::vector<int> &core, std::vector<std::vector<int>> &neighbours, int &label) { + if(labels[i] != -1) { + return; + } + labels[i] = label; + if(core[i] != 0) { + for(int j : neighbours[i]) { + cluster(j, core, neighbours, label); + } + } +} + +template<class T> +void DBSCAN<T>::fit(std::vector<std::vector<T>> x) { + int n = x.size(); + + std::vector<int> core(n); + std::vector<std::vector<int>> neighbours(n, std::vector<int>()); + + labels = std::vector<int>(n, -1); + + for(int i = 0; i < n; i++) { + std::vector<int> neighbourIndices; + for(int j = 0; j < n; j++) { + if(i == j) { + continue; + } + if(euclideanDistance(x[i], x[j]) <= eps) { + neighbourIndices.push_back(j); + } + } + int const samples = neighbourIndices.size(); + if(samples >= minSamples) { + core[i]++; + neighbours[i] = neighbourIndices; + } + } + int clusters = 0; + for(int i = 0; i < n; i++) { + if(core[i] == 0 || labels[i] != -1) { + continue; + } + cluster(i, core, neighbours, clusters); + clusters++; + } +} + +template<class T> +std::vector<int> DBSCAN<T>::fitPredict(std::vector<std::vector<T>> x) { + fit(x); + return labels; +} + +template<class T> +std::vector<int> DBSCAN<T>::getLabels() { + return labels; +} \ No newline at end of file diff --git a/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp b/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp new file mode 100644 index 0000000..ac0b6e7 --- /dev/null +++ b/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp @@ -0,0 +1,82 @@ +/** + * @file methods/cluster/DBSCAN/DBSCAN.hpp + * + * The header file for DBSCAN + */ +#ifndef SLOWMOKIT_DBSCAN_HPP +#define SLOWMOKIT_DBSCAN_HPP + +#include "core.hpp" +/** + * Class carrying implementation of the DBSCAN clustering algorithm + * @tparam T type of the data to be clustered + */ +template<class T> +class DBSCAN +{ +private: + + /** + * Measure of how close a point should be to be considered in the vicinity of another point, default value is 0.5 + */ 
+ long double eps; + + /** + * Minimum number of points that should lie in the vicinity of a point for it to be considered a core point, default value is 5 + */ + int minSamples; + + /** + * Labels assigned to each data point after fitting, the values range from 0 to clusters - 1, outliers are assigned -1 + */ + std::vector<int> labels; + + /** + * Evaluates the euclidean distance between two feature vectors + * @param p1 the first feature vector + * @param p2 the second feature vector + * @return the euclidean distance between the two vectors + * @throws invalid_argument exception when the feature vectors are unequal in size + */ + long double euclideanDistance(const std::vector<T> &p1, const std::vector<T> &p2); + + /** + * Helper function for recursively clustering the points using DBSCAN + * @param i index of the point that is to be assigned a cluster + * @param core vector of flags, non-zero when the point at that index is a core point + * @param neighbours 2D vector carrying neighbours of each of the core points + * @param label label of the cluster to be assigned to this point + */ + void cluster(int i, std::vector<int> &core, std::vector<std::vector<int>> &neighbours, int &label); + +public: + + /** + * Constructor for creating an instance of the DBSCAN class + * @param eps measure of how close a point should be to be considered in the vicinity of another point, default is 0.5 + * @param minSamples minimum number of points that should lie in the vicinity of a point for it to be considered a core point, default is 5 + * @throws invalid_argument exception when eps or minSamples is less than 0 + */ + DBSCAN(long double eps = 0.5, int minSamples = 5); + + /** + * Fits and clusters the given training set + * @param x list of feature vectors to be clustered + */ + void fit(std::vector<std::vector<T>> x); + + /** + * Fits and clusters the given training set and returns the labels assigned to each data point + * @param x list of feature vectors + * @return vector of labels assigned to each data point + */ + std::vector<int> 
fitPredict(std::vector<std::vector<T>> x); + + /** + * Returns the labels assigned to each data point of the training set fitted into the model + * @return vector of labels assigned to each data point + */ + std::vector<int> getLabels(); +}; + +#endif //SLOWMOKIT_DBSCAN_HPP From 09a06dd4eda61712687d624f6637058d2bf7a3f4 Mon Sep 17 00:00:00 2001 From: ken1000minus7 Date: Fri, 10 Feb 2023 12:48:57 +0530 Subject: [PATCH 2/2] Updated function signatures --- src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp b/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp index ac0b6e7..c671043 100644 --- a/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp +++ b/src/slowmokit/methods/cluster/DBSCAN/DBSCAN.hpp @@ -57,20 +57,20 @@ class DBSCAN * @param minSamples minimum number of points that should lie in the vicinity of a point for it to be considered a core point, default is 5 * @throws invalid_argument exception when eps or minSamples is less than 0 */ - DBSCAN(long double eps = 0.5, int minSamples = 5); + DBSCAN(long double = 0.5, int = 5); /** * Fits and clusters the given training set * @param x list of feature vectors to be clustered */ - void fit(std::vector<std::vector<T>> x); + void fit(std::vector<std::vector<T>>); /** * Fits and clusters the given training set and returns the labels assigned to each data point * @param x list of feature vectors * @return vector of labels assigned to each data point */ - std::vector<int> fitPredict(std::vector<std::vector<T>> x); + std::vector<int> fitPredict(std::vector<std::vector<T>>); /** * Returns the labels assigned to each data point of the training set fitted into the model