Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancing K-Means initialization options #286

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 37 additions & 13 deletions include/algorithms/public/SKMeans.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ under the European Union’s Horizon 2020 research and innovation programme
#include "../../data/TensorTypes.hpp"
#include <Eigen/Core>
#include <queue>
#include <random>
#include <string>

namespace fluid {
Expand All @@ -27,8 +28,16 @@ class SKMeans : public KMeans
{

public:

/// Strategies available for seeding the cluster means before training.
/// The numeric values are spelled out because callers pass the choice around
/// as a plain unsigned integer.
enum Initializer {
  /// Give every data point a random cluster label, then derive the means
  /// from that random partition.
  Random_Partition = 0,
  /// 'Forgy' seeding: use k randomly chosen data points as the initial means.
  Forgy = 1
};

void train(const FluidDataSet<std::string, double, 1>& dataset, index k,
index maxIter)
index maxIter, unsigned initialize )
{
using namespace Eigen;
using namespace _impl;
Expand All @@ -41,14 +50,14 @@ class SKMeans : public KMeans
{
mK = k;
mDims = dataset.pointSize();
initMeans(dataPoints);
initMeans(dataPoints, initialize);
}

while (maxIter-- > 0)
{
mEmbedding = mMeans.matrix() * dataPointsT;
auto assignments = assignClusters(mEmbedding);
if (!changed(assignments)) { break; }
if (mAssignments.rows() && !changed(assignments)) { break; }
else
mAssignments = assignments;
updateEmbedding();
Expand All @@ -69,19 +78,34 @@ class SKMeans : public KMeans
}

private:

void initMeans(Eigen::MatrixXd& dataPoints)
void initMeans(Eigen::MatrixXd& dataPoints, unsigned initializer)
{
using namespace Eigen;
mMeans = ArrayXXd::Zero(mK, mDims);
mAssignments =
((0.5 + (0.5 * ArrayXd::Random(dataPoints.rows()))) * (mK - 1))
.round()
.cast<int>();
mEmbedding = MatrixXd::Zero(mK, dataPoints.rows());
for (index i = 0; i < dataPoints.rows(); i++)
mEmbedding(mAssignments(i), i) = 1;
computeMeans(dataPoints);

switch (initializer)
{
default:
case Initializer::Random_Partition:
mAssignments =
((0.5 + (0.5 * ArrayXd::Random(dataPoints.rows()))) * (mK - 1))
.round()
.cast<int>();
mEmbedding = MatrixXd::Zero(mK, dataPoints.rows());
for (index i = 0; i < dataPoints.rows(); i++)
mEmbedding(mAssignments(i), i) = 1;
computeMeans(dataPoints);
break;

case Initializer::Forgy: // means from random selection of data points
ArrayXidx dataIndices =
ArrayXidx::LinSpaced(dataPoints.rows(), 0, dataPoints.rows() - 1);
std::vector<Index> samples(mK);
std::sample(dataIndices.begin(), dataIndices.end(), samples.begin(), mK,
std::mt19937{std::random_device{}()});
mMeans = dataPoints(samples, Eigen::all);
break;
}
}

void updateEmbedding()
Expand Down
12 changes: 7 additions & 5 deletions include/clients/nrt/SKMeansClient.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ namespace fluid {
namespace client {
namespace skmeans {

// Positional indices of the entries passed to defineParameters() for
// SKMeansParams, in declaration order. kInit addresses the "initialize"
// parameter that is read with get<kInit>() when training.
enum { kName, kNumClusters, kThreshold, kMaxIter, kInit };

constexpr auto SKMeansParams = defineParameters(
StringParam<Fixed<true>>("name", "Name"),
LongParam("numClusters", "Number of Clusters", 4, Min(0)),
FloatParam("encodingThreshold", "Encoding Threshold", 0.25, Min(0), Max(1)),
LongParam("maxIter", "Max number of Iterations", 100, Min(1)));
LongParam("maxIter", "Max number of Iterations", 100, Min(1)),
EnumParam("initialize","Initialize method",0, "Random Assignment", "Sampled Means")
);

class SKMeansClient : public FluidBaseClient,
OfflineIn,
Expand Down Expand Up @@ -79,7 +81,7 @@ class SKMeansClient : public FluidBaseClient,
if (dataSet.size() == 0) return Error<IndexVector>(EmptyDataSet);
if (k <= 1) return Error<IndexVector>(SmallK);
if(mTracker.changed(k)) mAlgorithm.clear();
mAlgorithm.train(dataSet, k, maxIter);
mAlgorithm.train(dataSet, k, maxIter, get<kInit>());
IndexVector assignments(dataSet.size());
mAlgorithm.getAssignments(assignments);
return getCounts(assignments, k);
Expand All @@ -100,7 +102,7 @@ class SKMeansClient : public FluidBaseClient,
if (k <= 1) return Error<IndexVector>(SmallK);
if (maxIter <= 0) maxIter = 100;
if(mTracker.changed(k)) mAlgorithm.clear();
mAlgorithm.train(dataSet, k, maxIter);
mAlgorithm.train(dataSet, k, maxIter, get<kInit>());
IndexVector assignments(dataSet.size());
mAlgorithm.getAssignments(assignments);
StringVectorView ids = dataSet.getIds();
Expand Down Expand Up @@ -171,7 +173,7 @@ class SKMeansClient : public FluidBaseClient,
if (k <= 1) return Error<IndexVector>(SmallK);
if (maxIter <= 0) maxIter = 100;
if(mTracker.changed(k)) mAlgorithm.clear();
mAlgorithm.train(dataSet, k, maxIter);
mAlgorithm.train(dataSet, k, maxIter,get<kInit>());
IndexVector assignments(dataSet.size());
mAlgorithm.getAssignments(assignments);
encode(srcClient, dstClient);
Expand Down
Loading