Merge branch 'imp_dbscan_ex' into 'main'
Implement DBSCAN clustering.

See merge request zhangc/mcpevent2hist!16
KedoKudo committed Feb 10, 2023
2 parents 17ba90e + 5dbfd65 commit 3c3a38f
Showing 6 changed files with 326 additions and 24 deletions.
3 changes: 3 additions & 0 deletions sophiread/environment_linux.yml
@@ -10,6 +10,9 @@ dependencies:
- gtest
- qwt
- eigen
- mlpack
- cereal


# Not Windows, OpenGL implementation:
- mesalib>=18.0.0
2 changes: 2 additions & 0 deletions sophiread/environment_mac.yml
@@ -10,6 +10,8 @@ dependencies:
- gtest
- qwt
- eigen
- mlpack
- cereal

# Not Windows, OpenGL implementation:
- mesalib>=18.0.0
43 changes: 35 additions & 8 deletions sophiread/include/dbscan.h
@@ -3,14 +3,41 @@
#include "clustering.h"

class DBSCAN : public ClusteringAlgorithm {
public:
DBSCAN(double eps, int min_samples)
: m_eps(eps), m_min_samples(min_samples){};
public:
DBSCAN(double eps_time, size_t min_points_time, double eps_xy, size_t min_points_xy)
: m_eps_time(eps_time), m_min_points_time(min_points_time), m_eps_xy(eps_xy), m_min_points_xy(min_points_xy){};
~DBSCAN() = default;
public:
void set_method(std::string method) { m_method = method; };
void reset();
std::vector<int> get_cluster_labels();
void fit(const std::vector<Hit>& hits);
std::vector<NeutronEvent> get_events(const std::vector<Hit>& hits);
~DBSCAN() = default;

private:
double m_eps; // The maximum distance between two points
double m_min_samples; // The minimum number of points in a cluster
private:
class TimeClusterInfo
{
public:
TimeClusterInfo();
TimeClusterInfo(const double time, const size_t xy_index);
~TimeClusterInfo() = default;
public:
double m_time_mean;
double m_time_sum;
double m_time_min;
double m_time_max;
std::vector<size_t> m_time_cluster_xy_indexes; // wrt input hits vector
};
void fit1D(std::vector<double> &data, size_t &number_of_clusters, std::vector<size_t> &labels, std::vector<double> &centroids);
void fit2D(std::vector<std::pair<double,double>> &data, size_t &number_of_clusters, std::vector<size_t> &labels,
std::vector<std::pair<double,double>> &centroids);
void mergeTimeClusters1D(std::vector<TimeClusterInfo>& input_infos, std::vector<TimeClusterInfo>& merged_infos);
private:
std::string m_method{"centroid"}; // method for centroid
double m_eps_time; // The maximum distance between two time points
size_t m_min_points_time; // The minimum number of points in a time cluster
double m_eps_xy; // The maximum distance between two XY points
size_t m_min_points_xy; // The minimum number of points in an XY cluster
std::vector<NeutronEvent> m_events;
const size_t m_max_hit_chunk_size = 2e6;
};
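
For orientation, here is a minimal usage sketch of the reworked interface (not part of the commit). The helper name clusterWithDBSCAN is hypothetical, the eps/min_points values are placeholders mirroring the ones chosen in sophiread.cpp below, and Hit/NeutronEvent are assumed to come from clustering.h as elsewhere in the repository:

#include <vector>
#include "dbscan.h"

// Hypothetical helper, for illustration only: cluster hits with the new
// two-stage (time, then XY) DBSCAN and return the resulting neutron events.
std::vector<NeutronEvent> clusterWithDBSCAN(const std::vector<Hit>& hits) {
  DBSCAN dbscan(3.0 /*eps time*/, 10 /*min_points time*/,
                2.0 /*eps xy*/, 5 /*min_points xy*/);  // placeholder parameters
  dbscan.set_method("centroid");   // the only method used by this commit
  return dbscan.get_events(hits);  // reset() + fit() + return the fitted events
}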

20 changes: 15 additions & 5 deletions sophiread/sophiread.cpp
@@ -4,19 +4,19 @@
*
*/
#include <unistd.h>

#include <fstream>
#include <iostream>

#include "abs.h"
#include "tpx3.h"
#include "abs.h"
#include "dbscan.h"

int main(int argc, char *argv[]) {
// processing command line arguments
std::string in_tpx3;
std::string out_hits;
std::string out_events;
bool verbose = false;
bool use_abs_algorithm = true;
int opt;

// help message string
@@ -61,8 +61,18 @@ int main(int argc, char *argv[]) {
auto hits = readTimepix3RawData(in_tpx3);

// clustering and fitting
ClusteringAlgorithm *alg = new ABS(5.0);
alg->set_method("fast_gaussian");
ClusteringAlgorithm *alg;
if(use_abs_algorithm) {
alg = new ABS(5.0);
alg->set_method("fast_gaussian");
}
else
{
// parameters for DBSCAN were chosen based on the results from the frames_pinhole_3mm_1s_RESOLUTION_000001.tpx3 file
alg = new DBSCAN(3.0/*eps time*/, 10/*min_points time*/, 2.0/*eps xy*/, 5/*min_points xy*/);
alg->set_method("centroid");
}

alg->fit(hits);
auto labels = alg->get_cluster_labels();
// print out labeled hits
250 changes: 240 additions & 10 deletions sophiread/src/dbscan.cpp
@@ -1,23 +1,253 @@
#include <iostream>
#include <mlpack.hpp>
#include "dbscan.h"

#include <iostream>
DBSCAN::TimeClusterInfo::TimeClusterInfo() : m_time_mean(0.), m_time_sum(0.),m_time_min(DBL_MAX), m_time_max(DBL_MIN) {
m_time_cluster_xy_indexes = std::vector<size_t>();
}

DBSCAN::TimeClusterInfo::TimeClusterInfo(const double time, const size_t xy_index) : m_time_mean(time), m_time_sum(time),m_time_min(time), m_time_max(time) {
m_time_cluster_xy_indexes = std::vector<size_t>{xy_index};
}

/**
* @brief Fit the DBSCAN algorithm to the data
* @brief Run DBSCAN clustering algorithm on the hits
*
* @param hits
* @param hits :: hits
*/
void DBSCAN::fit(const std::vector<Hit>& hits) {
std::cout << "DBSCAN to be implemented." << std::endl;
void DBSCAN::fit(const std::vector<Hit>& hits) {
reset();
size_t numHits = hits.size();
std::cout << "Total number of hits: " << numHits << std::endl;
if (hits.size()==0)
return;

size_t max_number_of_hits = hits.size(); // how many hits we want to process in total
std::cout << "Maximum number of hits to be processed: " << max_number_of_hits << std::endl;
std::cout << "Maximum chunk size (hits): " << m_max_hit_chunk_size << std::endl;

if (max_number_of_hits <= m_max_hit_chunk_size)
std::cout << "Fitting time clusters (1D DBSCAN) on all hits" << std::endl;
else
std::cout << "Fitting time clusters (1D DBSCAN) on chunks of hits..." << std::endl;

size_t chunk_size{0}; // either m_max_hit_chunk_size or the number of unprocessed hits, whichever is smaller
std::vector<TimeClusterInfo> all_time_cluster_infos;
size_t hit_offset{0}; // for example, if the first chunk had N hits, the second chunk will have an offset of N, etc.
while(true) {
// make a chunk
chunk_size = std::min(m_max_hit_chunk_size, max_number_of_hits-hit_offset);
std::vector<double> chunk;
std::transform(hits.begin()+hit_offset, hits.begin() + hit_offset + chunk_size, std::back_inserter(chunk), [](Hit const& h){ return h.getTOA_ns(); });

// run 1D time clustering on the chunk
std::vector<size_t> labels;
std::vector<double> centroids_1D;
size_t number_of_clusters{0};
fit1D(chunk, number_of_clusters, labels, centroids_1D);

std::vector<TimeClusterInfo> time_cluster_infos;
time_cluster_infos.resize(number_of_clusters);
std::vector<TimeClusterInfo> non_cluster_infos; // these unassigned data points may still change their status later, during merging of chunks

// set the time mean of each new info from the centroids vector
for(size_t jj = 0; jj < number_of_clusters; jj++)
time_cluster_infos[jj].m_time_mean = centroids_1D[jj];

for(size_t ii=0; ii < labels.size(); ii++){
size_t label = labels[ii];
if(label == SIZE_MAX || label > number_of_clusters - 1)
{
non_cluster_infos.emplace_back(TimeClusterInfo(chunk[ii],ii+hit_offset));
continue;
}
TimeClusterInfo &info = time_cluster_infos[label];
info.m_time_cluster_xy_indexes.push_back(ii+hit_offset);
info.m_time_sum += chunk[ii];
info.m_time_min = std::min(info.m_time_min, chunk[ii]);
info.m_time_max = std::max(info.m_time_max, chunk[ii]);
}
// append non-cluster infos considering them as clusters with a single data point each
time_cluster_infos.insert(time_cluster_infos.end(), non_cluster_infos.begin(), non_cluster_infos.end());

// append the new infos to the total infos
all_time_cluster_infos.insert(all_time_cluster_infos.end(), time_cluster_infos.begin(), time_cluster_infos.end());

hit_offset += chunk_size;
if(max_number_of_hits-hit_offset==0)
break; // processed all hits
}

if (all_time_cluster_infos.empty())
return;

// merge all chunk results
std::vector<TimeClusterInfo> merged_time_cluster_infos;
mergeTimeClusters1D(all_time_cluster_infos, merged_time_cluster_infos);
//std::cout << "Number of time clusters before cluster size check: " << merged_time_cluster_infos.size() << std::endl;

// run 2D clustering of XY points for each time cluster satisfying the min_points size
assert(m_events.empty()); // must run reset() before fitting

std::cout << "Fitting XY clusters (2D DBSCAN) on every time cluster..." << std::endl;
std::cout << "Eps: " << m_eps_xy << "; min_points: " << m_min_points_xy << std::endl;
size_t number_of_valid_time_clusters{0};

for(auto & info : merged_time_cluster_infos) {
if(info.m_time_cluster_xy_indexes.size()<m_min_points_time)
continue;
std::vector<std::pair<double,double>> xy_points;
for(auto & index : info.m_time_cluster_xy_indexes)
xy_points.push_back( std::pair<double,double>(hits[index].getX(),hits[index].getY()) );

number_of_valid_time_clusters++;

std::vector<size_t> labels;
std::vector<std::pair<double,double>> centroids_2D;
size_t number_of_clusters{0};
fit2D(xy_points, number_of_clusters, labels, centroids_2D);

std::map<size_t,size_t> label_counts; // label vs. number of xy points with that label
for(size_t ii=0; ii < labels.size(); ii++){
size_t label = labels[ii];
if(label == SIZE_MAX || label > number_of_clusters - 1)
continue;
if (label_counts.count(label)==0)
label_counts[label]=0;
label_counts[label]++;
}

// append events
for (auto const& label_count : label_counts)
m_events.emplace_back(NeutronEvent(centroids_2D[label_count.first].first/*X*/,
centroids_2D[label_count.first].second/*Y*/, info.m_time_mean, label_count.second));
}

std::cout << "Final number of time clusters: " << number_of_valid_time_clusters << std::endl;
}

/**
* @brief Predict the clusters by retrieving the labels of the hits
* @brief Merge time cluster infos
*
* @param hits
* @return std::vector<int>
* @param input_infos :: unmerged time cluster infos
* @param merged_infos :: (output) merged time cluster infos
*/
void DBSCAN::mergeTimeClusters1D(std::vector<TimeClusterInfo>& input_infos, std::vector<TimeClusterInfo>& merged_infos)
{
std::cout << "Merging time clusters..." << std::endl;
// before merging, sort the infos by the min time
std::sort(input_infos.begin(), input_infos.end(), [](const TimeClusterInfo & a, const TimeClusterInfo & b) {return a.m_time_min < b.m_time_min;});

merged_infos.clear();
std::vector<TimeClusterInfo>::const_iterator it = input_infos.begin();
TimeClusterInfo current = *(it)++;
TimeClusterInfo next;
while (it != input_infos.end()) {
next = *(it);
if (current.m_time_max > next.m_time_min || next.m_time_min - current.m_time_max <= m_eps_time) { // checking if the clusters should be merged
current.m_time_max = std::max(current.m_time_max, next.m_time_max);
current.m_time_min = std::min(current.m_time_min, next.m_time_min);
current.m_time_cluster_xy_indexes.insert(current.m_time_cluster_xy_indexes.end(), next.m_time_cluster_xy_indexes.begin(),
next.m_time_cluster_xy_indexes.end());
current.m_time_sum += next.m_time_sum;
} else {
current.m_time_mean = current.m_time_sum/current.m_time_cluster_xy_indexes.size();
merged_infos.push_back(current);
current = *(it);
}
it++;
}
current.m_time_mean = current.m_time_sum/current.m_time_cluster_xy_indexes.size();
merged_infos.push_back(current);
// for(auto & info : merged_infos )
// std::cout << info.m_time_min << "," << info.m_time_mean << "," << info.m_time_max << "," << info.m_time_cluster_xy_indexes.size() << std::endl;
}

/**
* @brief Run DBSCAN algorithm in 1D
*
* @param data :: input data
* @param number_of_clusters :: (output) number of found clusters
* @param labels :: (output) cluster labels
* @param centroids :: (output) cluster centroids
*/
void DBSCAN::fit1D(std::vector<double> &data, size_t &number_of_clusters, std::vector<size_t> &labels,
std::vector<double> &centroids) {
// create an arma matrix from the data vector
arma::mat data_mat(&data[0], 1/*nrows*/, data.size()/*ncols*/, false /*arma::mat will re-use the input data vector memory*/);

// create the dbscan object
mlpack::DBSCAN<mlpack::RangeSearch<>,mlpack::OrderedPointSelection> dbs(m_eps_time, m_min_points_time);

// run mlpack clustering
arma::Row<size_t> labels_row;
arma::mat centroids_mat;
number_of_clusters = dbs.Cluster(data_mat, labels_row, centroids_mat);

std::cout << "Eps: " << m_eps_time << "; min_points: " << m_min_points_time << "; time clusters: " << number_of_clusters << std::endl;

// fill in the labels vector from the arma label row
labels_row.for_each( [&labels](size_t& val) { /*std::cout << (val==SIZE_MAX?"-1":std::to_string(val)) << ",";*/ labels.push_back(val); } );

// fill in the centroids vector from the arma centroids matrix
centroids_mat.for_each([&centroids](arma::mat::elem_type& val){/*std::cout << val << ",";*/centroids.push_back(val);});
//std::cout << "\n";
}

/**
* @brief Run DBSCAN algorithm in 2D
*
* @param data :: input data
* @param number_of_clusters :: (output) number of found clusters
* @param labels :: (output) cluster labels
* @param centroids :: (output) cluster centroids
*/
void DBSCAN::fit2D(std::vector<std::pair<double,double>> &data, size_t &number_of_clusters, std::vector<size_t> &labels,
std::vector<std::pair<double,double>> &centroids){

// create an arma matrix from the data vector
arma::mat data_mat(&(data[0].first), 2/*nrows*/, data.size()/*ncols*/, false /*arma::mat will re-use the input data vector memory*/);

// create the dbscan object
mlpack::DBSCAN<mlpack::RangeSearch<>,mlpack::OrderedPointSelection> dbs(m_eps_xy, m_min_points_xy);

// run mlpack clustering
arma::Row<size_t> labels_row;
arma::mat centroids_mat;
number_of_clusters = dbs.Cluster(data_mat, labels_row, centroids_mat);

//std::cout << "Clusters: " << number_of_clusters << ", ";

// fill in the labels vector from the arma label row
labels_row.for_each( [&labels](size_t& val) { /*std::cout << (val==SIZE_MAX?"-1":std::to_string(val)) << ",";*/ labels.push_back(val); } );
// fill in the centroids vector from the arma centroids matrix
centroids_mat.each_col([&centroids](const arma::vec& b){centroids.push_back(std::pair<double,double>(b[0],b[1]));});
}

/**
* @brief Run DBSCAN clustering on the hits and return neutron events
*
* @param hits :: input hits
* @return neutron events
*/
std::vector<NeutronEvent> DBSCAN::get_events(const std::vector<Hit>& hits) {
std::cout << "DBSCAN to be implemented." << std::endl;
return std::vector<NeutronEvent>();
reset();
fit(hits);
std::cout << "Total number of events: " << m_events.size() << std::endl;
return m_events;
}

/**
* @brief Reset DBSCAN clustering
*/
void DBSCAN::reset(){
m_events.clear();
}

/**
* @brief Get DBSCAN cluster labels for each hit
*/
std::vector<int> DBSCAN::get_cluster_labels() {
std::cout << "DBSCAN::get_cluster_labels() method hasn't been implemented yet" << std::endl;
return std::vector<int>();
}
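
As a side note on the mlpack call pattern used in fit1D/fit2D above, the following standalone sketch (toy data and parameter values are made up for illustration, and it is not part of the commit) shows the pieces involved: the arma::mat constructor with copy_aux_mem = false aliases the std::vector's memory instead of copying it, Cluster() returns the number of clusters found, and noise points are labeled SIZE_MAX.

#include <cstddef>
#include <iostream>
#include <vector>
#include <mlpack.hpp>

int main() {
  // toy 1D data: two dense groups separated by a large gap, plus one outlier
  std::vector<double> toa{0.0, 1.0, 1.5, 2.0, 100.0, 100.5, 101.0, 102.0, 250.0};

  // wrap the vector as a 1 x N matrix without copying (copy_aux_mem = false)
  arma::mat data(toa.data(), 1, toa.size(), false);

  // eps = 3.0 and min_points = 3 are placeholder values
  mlpack::DBSCAN<mlpack::RangeSearch<>, mlpack::OrderedPointSelection> dbs(3.0, 3);

  arma::Row<size_t> labels;
  arma::mat centroids;  // one column per cluster centroid
  const size_t n_clusters = dbs.Cluster(data, labels, centroids);

  std::cout << "clusters found: " << n_clusters << std::endl;
  for (size_t i = 0; i < labels.n_elem; ++i)  // SIZE_MAX marks noise points
    std::cout << (labels[i] == SIZE_MAX ? -1 : static_cast<int>(labels[i])) << " ";
  std::cout << std::endl;
  return 0;
}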
