diff --git a/CMakeLists.txt b/CMakeLists.txt index 058ac48..1c0d172 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 2.8 FATAL_ERROR) -project(Fred LANGUAGES CXX) +project(Fred LANGUAGES CXX C) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -shared") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") @@ -12,15 +12,29 @@ include_directories(${CMAKE_SOURCE_DIR}/include) find_package(PythonInterp REQUIRED) find_package(PythonLibs REQUIRED) -find_package(Boost 1.63 COMPONENTS chrono ${BPY} ${BNPY} REQUIRED) +find_package(Boost 1.63 COMPONENTS system chrono ${BPY} ${BNPY} REQUIRED) find_package(OpenMP REQUIRED) +add_definitions(-D_GLIBCXX_PARALLEL) + include_directories(${PYTHON_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS}) -link_libraries(${PYTHON_LIBRARIES} ${Boost_LIBRARIES} OpenMP::OpenMP_CXX) +link_libraries(${PYTHON_LIBRARIES} ${Boost_LIBRARIES}) -add_definitions(-D_GLIBCXX_PARALLEL) +if(OpenMP_CXX_FOUND) + link_libraries(OpenMP::OpenMP_CXX) +endif() +if(NOT TARGET OpenMP::OpenMP_CXX) + find_package(Threads REQUIRED) + add_library(OpenMP::OpenMP_CXX IMPORTED INTERFACE) + set_property(TARGET OpenMP::OpenMP_CXX + PROPERTY INTERFACE_COMPILE_OPTIONS ${OpenMP_CXX_FLAGS}) + set_property(TARGET OpenMP::OpenMP_CXX + PROPERTY INTERFACE_LINK_LIBRARIES ${OpenMP_CXX_FLAGS} Threads::Threads) + +endif() +link_libraries(OpenMP::OpenMP_CXX) -PYTHON_ADD_MODULE(Fred +PYTHON_ADD_MODULE(backend src/fred_python_wrapper.cpp src/curve.cpp src/point.cpp diff --git a/Makefile b/Makefile index e87bbbf..2e6ddd0 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,10 @@ pre: - sudo apt install libboost-all-dev - sudo apt-get install python3-setuptools - sudo apt-get install python3-numpy - sudo apt-get install python3-pandas - sudo apt-get install python-setuptools - sudo apt-get install python-numpy - sudo apt-get install python-pandas + sudo apt install -y libboost-all-dev + sudo apt-get install -y python3-setuptools + sudo apt-get install -y python3-numpy + sudo apt-get install -y python-setuptools + sudo apt-get install -y python-numpy + sudo apt-get install -y cmake python3: cd py && python3 ./setup.py install --user diff --git a/README.md b/README.md index 113e0b2..57cdefc 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,27 @@ # Fred ![alt text](https://raw.githubusercontent.com/derohde/Fred/master/logo/logo.png "Fred logo") A fast, scalable and light-weight C++ Fréchet distance library, exposed to python. 
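For orientation, a minimal quick-start sketch of the renamed Python module (this is editorial illustration, not part of the patch; it mirrors the updated `py/test/test.py` further down in this diff, and the input data is purely illustrative):

```python
# Quick-start sketch; mirrors the updated py/test/test.py below in this diff.
import numpy as np
import Fred.backend as fred

# Curves are built from 1- or 2-dimensional numpy arrays of dtype double.
a = fred.Curve(np.array([0.0, 1.0, 0.0, 1.0]))
b = fred.Curve(np.array([0.0, 0.75, 0.25, 1.0]))

# approximation_error is now optional (default 0.001); the distance is in .value
print(fred.continuous_frechet(a, b).value)  # 0.25
print(fred.discrete_frechet(a, b).value)    # 0.25
```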
-## Ingredients +## Ingredients C++ Backend +`import Fred.backend as fred` + - continous Fréchet distance - - signature: `Fred.continuous_frechet(curve1, curve2, approximation_error)` default for approximation_error is 0.001 - - returns: `Fred.Continuous_Frechet_Result` with members `value`, `time_bounds`: running-time for upper and lower bound, `number_searches`: number of free space diagrams built, `time_searches`: running-time for free spaces + - signature: `fred.continuous_frechet(curve1, curve2, approximation_error)` default for approximation_error is 0.001 + - returns: `fred.Continuous_Frechet_Result` with members `value`, `time_bounds`: running-time for upper and lower bound, `number_searches`: number of free space diagrams built, `time_searches`: running-time for free spaces - discrete Fréchet distance - - signature: `Fred.discrete_frechet(curve1, curve2)` - - returns: `Fred.Discrete_Frechet_Result` with members `value` and `time` + - signature: `fred.discrete_frechet(curve1, curve2)` + - returns: `fred.Discrete_Frechet_Result` with members `value` and `time` - discrete k-center clustering (continuous Fréchet) [Without simplification; from **Approximating (k,l)-center clustering for curves**](https://dl.acm.org/doi/10.5555/3310435.3310616) - - signature: `Fred.discrete_kcenter(k, curves, approximation_error, with_assignment)` with parameters `approximation_error`: see continuous Fréchet, `with_assignment`: defaults to false; assigns curves to nearest centers if true - - returns: `Fred.Clustering_Result` with mebers `value`: objective value, `time`, `assignment`: empty if with_assignment=false + - signature: `fred.discrete_kcenter(k, curves, approximation_error, with_assignment)` with parameters `approximation_error`: see continuous Fréchet, `with_assignment`: defaults to false; assigns curves to nearest centers if true + - returns: `fred.Clustering_Result` with members `value`: objective value, `time`, `assignment`: empty if with_assignment=false - discrete k-median clustering (continuous Fréchet) [Algorithm 6 in **Coresets for (k,l)-Clustering under the Fréchet distance**](https://arxiv.org/pdf/1901.01870.pdf) - - signature: `Fred.discrete_kmedian(k, curves, approximation_error, with_assignment)` with parameters `approximation_error`: see continuous Fréchet, `with_assignment`: defaults to false; assigns curves to nearest centers if true - - returns: `Fred.Clustering_Result` with mebers `value`: objective value, `time`, `assignment`: empty if with_assignment=false + - signature: `fred.discrete_kmedian(k, curves, approximation_error, with_assignment)` with parameters `approximation_error`: see continuous Fréchet, `with_assignment`: defaults to false; assigns curves to nearest centers if true + - returns: `fred.Clustering_Result` with members `value`: objective value, `time`, `assignment`: empty if with_assignment=false - discrete one-median clustering (continuous Fréchet) via sampling [Section 3 in **Random Projections and Sampling Algorithms for Clustering of High Dimensional Polygonal Curves**](https://papers.nips.cc/paper/9443-random-projections-and-sampling-algorithms-for-clustering-of-high-dimensional-polygonal-curves) - - signature: `Fred.discrete_onemedian_sampling(curves, epsilon_sampling, approximation_error, with_assignment)` with parameters `approximation_error`: see continuous Fréchet, `epsilon_sampling`: (1+epsilon) approximation parameter, `with_assignment`: defaults to false; assigns curves to nearest centers if true - - returns: `Fred.Clustering_Result` with mebers 
`value`: objective value, `time`, `assignment`: empty if with_assignment=false + - signature: `fred.discrete_onemedian_sampling(curves, epsilon_sampling, approximation_error, with_assignment)` with parameters `approximation_error`: see continuous Fréchet, `epsilon_sampling`: (1+epsilon) approximation parameter, `with_assignment`: defaults to false; assigns curves to nearest centers if true + - returns: `fred.Clustering_Result` with members `value`: objective value, `time`, `assignment`: empty if with_assignment=false - dimension reduction via. gaussian random projection [Section 2 in **Random Projections and Sampling Algorithms for Clustering of High Dimensional Polygonal Curves**](https://papers.nips.cc/paper/9443-random-projections-and-sampling-algorithms-for-clustering-of-high-dimensional-polygonal-curves) - - signature: `Fred.dimension_reduction(curves, epsilon, empirical_constant)` with parameters `epsilon`: (1+epsilon) approximation parameter, `empirical_constant`: use constant of empirical study (faster, but less accurate) - - returns: `Fred.Curves` collection of curves + - signature: `fred.dimension_reduction(curves, epsilon, empirical_constant)` with parameters `epsilon`: (1+epsilon) approximation parameter, `empirical_constant`: use constant of empirical study (faster, but less accurate) + - returns: `fred.Curves` collection of curves ## Installation Get requirements under Ubuntu: `make pre` diff --git a/include/clustering.hpp b/include/clustering.hpp index ed96e1b..804c86e 100644 --- a/include/clustering.hpp +++ b/include/clustering.hpp @@ -54,16 +54,17 @@ struct Clustering_Result { }; -inline void cheap_dist(const curve_size_t i, const curve_size_t j, const Curves &in, std::vector> &distances, const distance_t eps) { +inline void cheap_dist(const curve_size_t i, const curve_size_t j, const Curves &in, std::vector> &distances, + const distance_t eps, const bool round) { if (distances[i][j] < 0) { - const auto dist = Frechet::Continuous::distance(in[i], in[j], eps); + const auto dist = Frechet::Continuous::distance(in[i], in[j], eps, round); distances[j][i] = dist.value; distances[i][j] = dist.value; } } -inline curve_size_t getNearestCenter(const curve_size_t i, const Curves &in, const Centers ¢ers, - std::vector> &distances, const distance_t eps) { +inline curve_size_t nearest_center(const curve_size_t i, const Curves &in, const Centers ¢ers, + std::vector> &distances, const distance_t eps, const bool round) { const auto infty = std::numeric_limits::infinity(); // cost for curve is infinity auto min_cost_elem = infty; @@ -71,7 +72,7 @@ inline curve_size_t getNearestCenter(const curve_size_t i, const Curves &in, con // except there is a center with smaller cost, then choose the one with smallest cost for (curve_size_t j = 0; j < centers.size(); ++j) { - cheap_dist(i, centers[j], in, distances, eps); + cheap_dist(i, centers[j], in, distances, eps, round); if (distances[i][centers[j]] < min_cost_elem) { min_cost_elem = distances[i][centers[j]]; nearest = j; @@ -80,14 +81,15 @@ inline curve_size_t getNearestCenter(const curve_size_t i, const Curves &in, con return nearest; } -inline auto curve_cost(const curve_size_t i, const Curves &in, const Centers ¢ers, std::vector> &distances, const distance_t eps) { +inline auto curve_cost(const curve_size_t i, const Curves &in, const Centers ¢ers, + std::vector> &distances, const distance_t eps, const bool round) { const auto infty = std::numeric_limits::infinity(); // cost for curve is infinity auto min_cost_elem = infty; // except there is 
a center with smaller cost, then choose the one with smallest cost for (curve_size_t j = 0; j < centers.size(); ++j) { - cheap_dist(i, centers[j], in, distances, eps); + cheap_dist(i, centers[j], in, distances, eps, round); if (distances[i][centers[j]] < min_cost_elem) { min_cost_elem = distances[i][centers[j]]; } @@ -96,18 +98,20 @@ inline auto curve_cost(const curve_size_t i, const Curves &in, const Centers &ce return min_cost_elem; } -inline auto center_cost_sum(const Curves &in, const Centers ¢ers, std::vector> &distances, const distance_t eps) { - double cost = 0.0; +inline auto center_cost_sum(const Curves &in, const Centers ¢ers, + std::vector> &distances, const distance_t eps, const bool round) { + distance_t cost = 0.0; // for all curves for (curve_size_t i = 0; i < in.size(); ++i) { - const auto min_cost_elem = curve_cost(i, in, centers, distances, eps); + const auto min_cost_elem = curve_cost(i, in, centers, distances, eps, round); cost += min_cost_elem; } return cost; } -inline Cluster_Assignment getClusterAssignment(const Curves &in, const Centers ¢ers, std::vector> &distances, const distance_t eps) { +inline Cluster_Assignment cluster_assignment(const Curves &in, const Centers ¢ers, + std::vector> &distances, const distance_t eps, const bool round) { Cluster_Assignment result; const auto k = centers.size(); @@ -115,12 +119,13 @@ inline Cluster_Assignment getClusterAssignment(const Curves &in, const Centers & for (curve_size_t i = 0; i < k; ++i) result.emplace(i, std::vector()); - for (curve_size_t i = 0; i < in.size(); ++i) result[getNearestCenter(i, in, centers, distances, eps)].push_back(i); + for (curve_size_t i = 0; i < in.size(); ++i) result[nearest_center(i, in, centers, distances, eps, round)].push_back(i); return result; } -Clustering_Result gonzalez(const curve_size_t num_centers, const Curves &in, const distance_t eps, const bool arya = false, const bool with_assignment = false) { +Clustering_Result gonzalez(const curve_size_t num_centers, const Curves &in, const distance_t eps, + const bool round = true, const bool arya = false, const bool with_assignment = false) { const auto start = boost::chrono::process_real_cpu_clock::now(); Clustering_Result result; @@ -151,7 +156,7 @@ Clustering_Result gonzalez(const curve_size_t num_centers, const Curves &in, con // all curves for (curve_size_t j = 0; j < in.size(); ++j) { - auto curr_curve_cost = curve_cost(j, in, centers, distances, eps); + auto curr_curve_cost = curve_cost(j, in, centers, distances, eps, round); if (curr_curve_cost > curr_maxdist) { curr_maxdist = curr_curve_cost; @@ -170,7 +175,7 @@ Clustering_Result gonzalez(const curve_size_t num_centers, const Curves &in, con if (arya) { - auto cost = center_cost_sum(in, centers, distances, eps); + auto cost = center_cost_sum(in, centers, distances, eps, round); auto approxcost = cost; auto gamma = 1/(3 * num_centers * in.size()); auto found = false; @@ -191,7 +196,7 @@ Clustering_Result gonzalez(const curve_size_t num_centers, const Curves &in, con // swap curr_centers[i] = j; // new cost - auto curr_cost = center_cost_sum(in, curr_centers, distances, eps); + auto curr_cost = center_cost_sum(in, curr_centers, distances, eps, round); // check if improvement is done if (cost - gamma * approxcost > curr_cost) { cost = curr_cost; @@ -206,7 +211,7 @@ Clustering_Result gonzalez(const curve_size_t num_centers, const Curves &in, con } if (with_assignment) { - result.assignment = getClusterAssignment(in, centers, distances, eps); + result.assignment = cluster_assignment(in, 
centers, distances, eps, round); } auto end = boost::chrono::process_real_cpu_clock::now(); @@ -216,11 +221,12 @@ Clustering_Result gonzalez(const curve_size_t num_centers, const Curves &in, con return result; } -Clustering_Result arya(const curve_size_t num_centers, const Curves &in, const distance_t eps, const bool with_assignment = false) { - return gonzalez(num_centers, in, eps, true, with_assignment); +Clustering_Result arya(const curve_size_t num_centers, const Curves &in, const distance_t eps, const bool round = true, const bool with_assignment = false) { + return gonzalez(num_centers, in, eps, round, true, with_assignment); } -Clustering_Result one_median_sampling(const double epsilon, const Curves &in, const distance_t eps, const bool with_assignment = false) { +Clustering_Result one_median_sampling(const double epsilon, const Curves &in, const distance_t eps, + const bool round = true, const bool with_assignment = false) { const auto start = boost::chrono::process_real_cpu_clock::now(); Clustering_Result result; Centers centers; @@ -230,7 +236,7 @@ Clustering_Result one_median_sampling(const double epsilon, const Curves &in, co const auto s = std::ceil(60); const auto t = std::ceil(std::log(60)/(epsilon*epsilon)); - Uniform_Random_Generator ugen; + Random::Uniform_Random_Generator ugen; const auto candidates = ugen.get(s); const auto witnesses = ugen.get(t); @@ -244,12 +250,12 @@ Clustering_Result one_median_sampling(const double epsilon, const Curves &in, co for (curve_size_t i = 0; i < candidates.size(); ++i) { const curve_size_t candidate = std::floor(candidates[i] * n); - double objective = 0; + distance_t objective = 0; for (curve_size_t j = 0; j < witnesses.size(); ++j) { const curve_size_t witness = std::floor(witnesses[j] * n); - cheap_dist(candidate, witness, in, distances, eps); + cheap_dist(candidate, witness, in, distances, eps, round); objective += distances[candidate][witness]; } @@ -261,17 +267,17 @@ Clustering_Result one_median_sampling(const double epsilon, const Curves &in, co centers.push_back(best_candidate); if (with_assignment) { - result.assignment = getClusterAssignment(in, centers, distances, eps); + result.assignment = cluster_assignment(in, centers, distances, eps, round); } auto end = boost::chrono::process_real_cpu_clock::now(); result.centers = centers; - result.value = center_cost_sum(in, centers, distances, eps); + result.value = center_cost_sum(in, centers, distances, eps, round); result.running_time = (end-start).count() / 1000000000.0; return result; } -Clustering_Result one_median_exhaustive(const Curves &in, const distance_t eps, const bool with_assignment = false) { +Clustering_Result one_median_exhaustive(const Curves &in, const distance_t eps, const bool round = true, const bool with_assignment = false) { const auto start = boost::chrono::process_real_cpu_clock::now(); Clustering_Result result; Centers centers; @@ -286,10 +292,10 @@ Clustering_Result one_median_exhaustive(const Curves &in, const distance_t eps, for (curve_size_t i = 0; i < in.size(); ++i) { - double objective = 0; + distance_t objective = 0; for (curve_size_t j = 0; j < in.size(); ++j) { - cheap_dist(i, j, in, distances, eps); + cheap_dist(i, j, in, distances, eps, round); objective += distances[i][j]; } @@ -301,7 +307,7 @@ Clustering_Result one_median_exhaustive(const Curves &in, const distance_t eps, centers.push_back(best_candidate); if (with_assignment) { - result.assignment = getClusterAssignment(in, centers, distances, eps); + result.assignment = 
cluster_assignment(in, centers, distances, eps, round); } auto end = boost::chrono::process_real_cpu_clock::now(); diff --git a/include/coreset.hpp b/include/coreset.hpp new file mode 100644 index 0000000..f881875 --- /dev/null +++ b/include/coreset.hpp @@ -0,0 +1,96 @@ +/* +Copyright 2020 Dennis Rohde + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without result.riction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPresult. OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ +#pragma once + +#include +#include + +#include "types.hpp" +#include "clustering.hpp" +#include "frechet.hpp" + +namespace np = boost::python::numpy; +namespace p = boost::python; + +namespace Coreset { + +class Onemedian_Coreset { + + std::vector coreset; + std::vector lambda; + const distance_t Lambda = 76; + distance_t cost; + +public: + Onemedian_Coreset() {} + + inline Onemedian_Coreset(const Curves &in, const distance_t epsilon, const double eps, const bool round = true, const double constant = 1) { + compute(in, epsilon, eps, round, constant); + } + + inline void compute(const Curves &in, const distance_t epsilon, const double eps, const bool round = true, const double constant = 1) { + const auto n = in.size(); + const auto m = in.get_m(); + const auto c_approx = Clustering::arya(1, in, eps, false); + const auto center = c_approx.centers[0]; + cost = c_approx.value; + if (cost == 0) { + std::cerr << "WARNING: cost is zero, coreset construction not possible - check your input" << std::endl; + return; + } + std::vector probabilities(n); + lambda = std::vector(n); + + for (curve_size_t i = 0; i < n; ++i) { + lambda[i] = 52.0 / n + 24.0 / cost * Frechet::Continuous::distance(in[i], in[center], eps, round).value; + probabilities[i] = (lambda[i]) / Lambda; + } + + auto prob_gen = Random::Custom_Probability_Generator(probabilities); + const std::size_t ssize = std::ceil(constant * 1/epsilon * 1/epsilon * std::log(m)); + const auto coreset_ind = prob_gen.get(ssize); + for (curve_size_t i = 0; i < ssize; ++i) { + coreset.push_back(coreset_ind[i]); + } + } + + inline np::ndarray get_lambda() const { + np::dtype dt = np::dtype::get_builtin(); + p::list l; + np::ndarray result = np::array(l, dt); + for (const auto &elem: lambda) { + l.append(elem); + } + result = np::array(l, dt); + return result; + } + + inline distance_t get_Lambda() const { + return Lambda; + } + + inline np::ndarray get_curves() const { + np::dtype dt = np::dtype::get_builtin(); + p::list l; + np::ndarray result = np::array(l, dt); + for (const auto &elem: coreset) { + l.append(elem); + } + result = np::array(l, dt); + return result; + } + + inline distance_t get_cost() const { + return cost; + } + +}; + 
+}; diff --git a/include/curve.hpp b/include/curve.hpp index d62f43c..6a1d654 100644 --- a/include/curve.hpp +++ b/include/curve.hpp @@ -11,9 +11,11 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI #pragma once #include +#include +#include -#include #include +#include #include "types.hpp" #include "point.hpp" @@ -23,13 +25,25 @@ namespace np = boost::python::numpy; namespace p = boost::python; class Curve { + dimensions_t number_dimensions; + Points points; + public: typedef curve_size_t index_t; - Curve(const dimensions_t dimensions) : number_dimensions{dimensions} {} + Curve() {} + Curve(const curve_size_t m, const dimensions_t dimensions) : points{m}, number_dimensions{dimensions} {} Curve(const Points &points, const dimensions_t dimensions); Curve(const np::ndarray &in); + inline Point operator[](const index_t i) const { + return points[i]; + } + + inline Point& operator[](const index_t i) { + return points[i]; + } + inline curve_size_t size() const { return points.size(); } @@ -42,10 +56,6 @@ class Curve { return number_dimensions; } - inline const Point& operator[](const index_t i) const { - return points[i]; - } - inline const Point& front() const { return points.front(); } @@ -54,10 +64,6 @@ class Curve { return points.back(); } - inline void push_back(const Point &point) { - points.push_back(point); - } - inline Points::iterator begin() { return points.begin(); } @@ -74,16 +80,35 @@ class Curve { return points.cend(); } -private: - const dimensions_t number_dimensions; - Points points; + std::string str() const; }; class Curves : public std::vector { + curve_size_t m; + public: + Curves() {} + Curves(const curve_size_t n, const curve_size_t m) : std::vector{n}, m{m} {} + + inline void add(Curve &curve) { + push_back(curve); + if (curve.size() > m) m = curve.size(); + } + inline const auto get(curve_size_t i) const { return const_cast(this)->operator[](i); } + + inline curve_size_t get_m() const { + return m; + } + + inline curve_size_t number() const { + return size(); + } + + std::string str() const; }; -std::ostream& operator<<(std::ostream& out, const Curve& curve); +std::ostream& operator<<(std::ostream& out, const Curve&); +std::ostream& operator<<(std::ostream& out, const Curves&); diff --git a/include/frechet.hpp b/include/frechet.hpp index f4457ab..624c022 100644 --- a/include/frechet.hpp +++ b/include/frechet.hpp @@ -27,16 +27,16 @@ namespace Continuous { std::size_t number_searches; }; - auto distance(const Curve&, const Curve&, const distance_t) -> Result; + Result distance(const Curve&, const Curve&, const distance_t, const bool); - auto _greedyUpperBound(const Curve&, const Curve&) -> distance_t; - - auto _distance(const Curve&, const Curve&, const distance_t, const distance_t, - const distance_t = std::numeric_limits::epsilon()) -> Result; + Result _distance(const Curve&, const Curve&, distance_t, distance_t, + const distance_t = std::numeric_limits::epsilon()); - bool _lessThan(const distance_t, const Curve&, const Curve&, + bool _less_than_or_equal(const distance_t, const Curve&, const Curve&, std::vector>&, std::vector>&, std::vector>&, std::vector>&); + + distance_t _greedy_upper_bound(const Curve&, const Curve&); } namespace Discrete { diff --git a/include/interval.hpp b/include/interval.hpp index dc0c6db..2a6e83b 100644 --- a/include/interval.hpp +++ b/include/interval.hpp @@ -19,9 +19,9 @@ class Interval { coordinate_t beg, en; public: - Interval() : beg(1), en(0) {} + Interval() : beg{1}, en{0} {} - Interval(coordinate_t 
begin, coordinate_t end) : beg(begin), en(end) {} + Interval(const coordinate_t begin, const coordinate_t end) : beg{begin}, en{end} {} inline bool operator<(const Interval &other) const { return (beg < other.begin()) or ((beg == other.begin()) and (en < other.end())); diff --git a/include/jl_transform.hpp b/include/jl_transform.hpp index 3a0954b..45c125c 100644 --- a/include/jl_transform.hpp +++ b/include/jl_transform.hpp @@ -20,7 +20,7 @@ namespace JLTransform { if (in.empty()) return in; - auto rg = Gauss_Random_Generator(0, 1); + auto rg = Random::Gauss_Random_Generator(0, 1); auto number_points = 0; for (auto &elem: in) number_points += elem.size(); @@ -38,13 +38,13 @@ namespace JLTransform { for (auto &elem: mat) elem = rg.get(in[0].dimensions()); - Curves result; + Curves result(in.size(), in.get_m()); auto sqrtk = std::sqrt(new_number_dimensions); #pragma omp parallel for for (curve_size_t l = 0; l < in.size(); ++l) { - result.push_back(Curve(new_number_dimensions)); + result[l] = Curve(in[l].size(), in[l].dimensions()); for (curve_size_t i = 0; i < in[l].size(); ++i) { std::vector new_coords(new_number_dimensions); @@ -61,7 +61,7 @@ namespace JLTransform { } Point new_point(new_coords); - result[l].push_back(new_point); + result[l][i] = new_point; } #if DEBUG diff --git a/include/point.hpp b/include/point.hpp index c5937ef..d39ad71 100644 --- a/include/point.hpp +++ b/include/point.hpp @@ -13,6 +13,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPresult. OR IM #include #include #include +#include +#include #include "types.hpp" #include "interval.hpp" @@ -22,36 +24,35 @@ class Point { public: inline Point() {} - inline Point(const std::vector &in) : coordinates{in} {} - inline std::vector& getCoordinates() { - return this->coordinates; + inline dimensions_t size() const { + return coordinates.size(); } - inline const std::vector& getCoordinates() const { - return this->coordinates; + inline coordinate_t operator[](const dimensions_t i) const { + return coordinates[i]; } - inline coordinate_t operator[](dimensions_t i) const { - return coordinates[i]; + inline coordinate_t& operator[](const dimensions_t i) { + return coordinates[i]; } inline Point& operator+=(const Point &point) { for (dimensions_t i = 0; i < coordinates.size(); ++i){ - coordinates[i] += point.getCoordinates()[i]; + coordinates[i] += point[i]; } return *this; } inline Point& operator-=(const Point &point) { for (dimensions_t i = 0; i < coordinates.size(); ++i){ - coordinates[i] -= point.getCoordinates()[i]; + coordinates[i] -= point[i]; } return *this; } - inline Point& operator/=(distance_t distance) { + inline Point& operator/=(const distance_t distance) { for (dimensions_t i = 0; i < coordinates.size(); ++i){ coordinates[i] /= distance; } @@ -61,7 +62,7 @@ class Point { inline Point operator+(const Point &point) const { auto result = *this; for (dimensions_t i = 0; i < coordinates.size(); ++i){ - result.coordinates[i] += point.getCoordinates()[i]; + result.coordinates[i] += point[i]; } return result; @@ -70,17 +71,16 @@ class Point { inline Point operator-(const Point &point) const { auto result = *this; for (dimensions_t i = 0; i < coordinates.size(); ++i){ - result.coordinates[i] -= point.getCoordinates()[i]; + result.coordinates[i] -= point[i]; } return result; } inline Point operator*(const distance_t mult) const { - Point result; - result.coordinates = std::vector (coordinates.size()); + Point result = *this; for (dimensions_t i = 0; i < coordinates.size(); ++i){ - 
result.getCoordinates()[i] = coordinates[i] * mult; + result[i] = result[i] * mult; } return result; } @@ -88,25 +88,23 @@ class Point { inline distance_t operator*(const Point &p) const { distance_t result = 0; for (dimensions_t i = 0; i < coordinates.size(); ++i) { - result += coordinates[i] * p.getCoordinates()[i]; + result += coordinates[i] * p[i]; } return result; } - inline Point operator/(distance_t dist) { - Point result; - result.coordinates = std::vector(coordinates.size()); + inline Point operator/(const distance_t dist) const { + Point result = *this; for (dimensions_t i = 0; i < coordinates.size(); ++i){ - result.getCoordinates()[i] = coordinates[i] / dist; + result[i] = result[i] / dist; } return result; } inline distance_t dist_sqr(const Point &point) const { - distance_t result = 0; - distance_t temp; + distance_t result = 0, temp; for (dimensions_t i = 0; i < coordinates.size(); ++i){ - temp = coordinates[i] - point.getCoordinates()[i]; + temp = coordinates[i] - point[i]; result += temp * temp; } return result; @@ -147,6 +145,7 @@ class Point { return Interval(std::max(0., lambda1), std::min(1., lambda2)); } + std::string str() const; }; diff --git a/include/random.hpp b/include/random.hpp index 2eeccb8..85a4595 100644 --- a/include/random.hpp +++ b/include/random.hpp @@ -11,6 +11,9 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI #pragma once #include +#include + +namespace Random { template class Uniform_Random_Generator { @@ -25,10 +28,10 @@ class Uniform_Random_Generator { return distribution(mersenne_twister); } - inline std::vector get(const unsigned long n) { - std::vector results; - for (auto i = n; i > 0; --i) { - results.push_back(get()); + inline std::vector get(const std::size_t n) { + std::vector results(n); + for (std::size_t i = 0; i < n; ++i) { + results[i] = get(); } return results; } @@ -48,32 +51,45 @@ class Gauss_Random_Generator { return distribution(mersenne_twister); } - inline std::vector get(const unsigned long n) { - std::vector results; - for (auto i = n; i > 0; --i) { - results.push_back(get()); + inline std::vector get(const std::size_t n) { + std::vector results(n); + for (std::size_t i = 0; i < n; ++i) { + results[i] = get(); } return results; } }; -/*template -class Rademacher_Random_Generator { - std::mt19937_64 mersenne_twister; - std::discrete_distribution distribution; - +template +class Custom_Probability_Generator { + Uniform_Random_Generator uform_gen; + std::vector cumulative_probabilities; + public: - Rademacher_Random_Generator() : mersenne_twister{std::random_device{}()}, distribution{-1/2, 1/2} {} + Custom_Probability_Generator(const std::vector &probabilities) : uform_gen{0, 1} { + if (probabilities.empty()) return; + cumulative_probabilities = std::vector(probabilities.size()); + cumulative_probabilities[0] = probabilities[0]; + for (std::size_t i = 1; i < probabilities.size(); ++i) + cumulative_probabilities[i] = probabilities[i] + cumulative_probabilities[i-1]; + } - inline double get() { - return distribution(mersenne_twister); + inline T get() { + const auto n = cumulative_probabilities.size(); + const auto r = uform_gen.get(); + const auto upper = std::upper_bound(cumulative_probabilities.cbegin(), cumulative_probabilities.cend(), r); + assert(upper != cumulative_probabilities.cend()); + const auto result = std::distance(cumulative_probabilities.cbegin(), upper); + return result; } - inline std::vector get(const unsigned long n) { - std::vector results; - for (auto i = n; i > 0; --i) { - 
results.push_back(get()); + inline std::vector get(const std::size_t n) { + std::vector results(n); + for (std::size_t i = 0; i < n; ++i) { + results[i] = get(); } return results; } -};*/ +}; + +}; diff --git a/include/types.hpp b/include/types.hpp index 2dc24df..0da733f 100644 --- a/include/types.hpp +++ b/include/types.hpp @@ -10,7 +10,6 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI #pragma once - typedef double distance_t; typedef double coordinate_t; diff --git a/py/Fred/__init__.py b/py/Fred/__init__.py new file mode 100644 index 0000000..baacd25 --- /dev/null +++ b/py/Fred/__init__.py @@ -0,0 +1 @@ +from . import backend diff --git a/py/Fred/hd_fs.py b/py/Fred/hd_fs.py new file mode 100644 index 0000000..0590767 --- /dev/null +++ b/py/Fred/hd_fs.py @@ -0,0 +1,120 @@ +from . import backend + +from cvxopt import matrix, solvers +solvers.options['show_progress'] = False +import numpy as np + +def curve_within_radii(T, radii): + l = len(T) + start_point = _intersection_point(T, radii, [1] * l, [0] * l) + if not start_point[0]: + return (False, ) + end_point = _intersection_point(T, radii, [T[i].points - 1 for i in range(0, l)], [1] * l) + if not end_point[0]: + return (False, ) + borders = [([1] * len(T), start_point[1]), ([T[i].points for i in range(0, l)], end_point[1])] + _compute_borders(T, radii, np.array([1] * l, dtype=np.uint64), borders) + borders = sorted(borders, key=lambda border: border[0]) + solution = list() + solution.append(borders[-1]) + alpha = [T[i].points for i in range(0, l)] + for i in range(len(borders)-2, -1, -1): + diff = np.array(alpha) - np.array(borders[i][0]) + bools = any(diff > 1) or any(np.array(borders[i][0]) > np.array(alpha)) or all(np.array(borders[i][0]) == np.array(alpha)) + if not bools: + solution.append(borders[i]) + alpha = borders[i][0] + if solution[-1] != borders[0]: + return (False, ) + solution = reversed(solution) + return (True, backend.Curve(np.array([sol[1] for sol in solution], dtype="double"))) + +def _compute_borders(T, radii, alpha, borders): + for j in range(0, len(T)): + if alpha[j] + 1 < T[j].points: + border = _cellborder(T, radii, alpha, j, 1) + if border[0]: + mask = np.zeros((len(T), )) + mask[j] = 1 + borders.append(([alpha[i] + border[1][0][i] for i in range(0, len(alpha))], border[1][1])) + _compute_borders(T, radii, alpha + mask, borders) + +def _intersection_point(T, radii, alpha, values): + lines = list() + for i in range(0, len(T)): + lines.append([(1-values[i])*T[i][alpha[i]-1][k] + values[i]*T[i][alpha[i]][k] for k in range(0, T[i].dimensions)]) + + c = matrix([0.] * T[0].dimensions) + G = [] + for i in range(0, T[0].dimensions): + m = len(T) * ([0.] + i * [0.] 
+ [-1] + (T[0].dimensions - i - 1) * [0.]) + G.append(m) + + G = matrix(G) + h = [] + for i in range(0, len(T)): + h += [radii[i]] + [-lines[i][j] for j in range(0, T[i].dimensions)] + h = matrix(h) + dims = {'l': 0, 'q': [1+T[0].dimensions] * len(T), 's': [0]} + sol = solvers.conelp(c, G, h, dims) + return sol['status'] == 'optimal', np.array([s for s in sol['x']]) if sol['x'] is not None else None + +def _cellborder(T, radii, alpha, j, value): + lines = list() + for i in range(0, len(T)): + if (i == j): + lines.append(([(1-value)*T[i][int(alpha[i]-1)][k] + value*T[i][int(alpha[i])][k] for k in range(0, T[i].dimensions)],)) + else: + lines.append(([T[i][int(alpha[i]-1)][k] - T[i][int(alpha[i])][k] for k in range(0, T[i].dimensions)], + [T[i][int(alpha[i]-1)][k] for k in range(0, T[i].dimensions)])) + + dims = {'l': 2 * (len(T)-1), 'q': [1+T[0].dimensions] * len(T), 's': [0]} + + c = matrix([0.] * (len(T) - 1) + [0.] * T[0].dimensions) + G = list() + + for i in range(0, len(T)): + if i == j: + pass + elif i < j: + m = i * [0.] + [1.] + (len(T) - i - 2) * [0.] + m += i * [0.] + [-1.] + (len(T) - i - 2) * [0.] + m += (i * (T[i].dimensions + 1)) * [0.] + [0.] + [-lines[i][0][k] for k in range(0, T[i].dimensions)] + ((len(T) - i - 1) * (T[i].dimensions + 1)) * [0.] + G.append(m) + else: + m = (i - 1) * [0.] + [1.] + (len(T) - i - 1) * [0.] + m += (i - 1) * [0.] + [-1.] + (len(T) - i - 1) * [0.] + m += (i * (T[i].dimensions + 1)) * [0.] + [0.] + [-lines[i][0][k] for k in range(0, T[i].dimensions)] + ((len(T) - i - 1) * (T[i].dimensions + 1)) * [0.] + G.append(m) + + for i in range(0, T[0].dimensions): + m = (2 * (len(T) - 1)) * [0.] + m += len(T) * ([0.] + i * [0.] + [-1] + (T[0].dimensions - i - 1) * [0.]) + G.append(m) + + G = matrix(G) + + h = [1.] * (len(T) - 1) + h += [0.] * (len(T) - 1) + for i in range(0, len(T)): + h += [radii[i]] + if i == j: + h += [-lines[i][0][k] for k in range(0, T[i].dimensions)] + else: + h += [-lines[i][1][k] for k in range(0, T[i].dimensions)] + h = matrix(h) + try: + sol = solvers.conelp(c, G, h, dims) + except Exception as e: + print(e) + if sol['x'] is not None: + x = sol['x'] + y = np.empty((len(T),)) + for i in range(0, len(T)): + if i < j: + y[i] = x[i] + elif i == j: + y[j] = value + else: + y[i] = x[i-1] + return sol['status'] == 'optimal', (y, np.array([s for s in sol['x'][len(T)-1:]])) if sol['x'] is not None else None diff --git a/py/Fred/median.py b/py/Fred/median.py new file mode 100644 index 0000000..4bc20ba --- /dev/null +++ b/py/Fred/median.py @@ -0,0 +1,47 @@ +from . import backend +from .hd_fs import curve_within_radii + +import numpy as np +import itertools +import progressbar + +def frechet_median(T, epsilon): + coreset = backend.onemedian_coreset(T, epsilon/2.) 
+ cost = coreset.cost + lower = cost/6 + lambd = coreset.lambd + Lambd = coreset.Lambd + curves = list(coreset.curves()) + curves_dist = list(set(curves)) + coreset_dist = backend.Curves() + coreset_dist_multiplicities = [curves.count(elem) for elem in curves_dist] + for curve_ind in curves_dist: + coreset_dist.add(T[int(curve_ind)]) + radiis = list() + length = 1 + for i in range(0, len(curves_dist)): + range_ = np.linspace(0, cost, num=np.ceil(cost/(epsilon*cost/24*lambd[curves_dist[i]]/Lambd)), endpoint=True) + length *= len(range_) + radiis.append(range_) + candidates = list() + count = 0 + with progressbar.ProgressBar(max_value=length) as bar: + for radii in itertools.product(*radiis): + count += 1 + bar.update(count) + rsum = np.sum(radii) + if rsum < lower or rsum > cost: + continue + candidate = curve_within_radii(coreset_dist, radii) + if candidate[0]: + candidates.append(candidate[1]) + min_cost_curve = 0 + min_cost = np.infty + for i in progressbar.progressbar(range(0, len(candidates))): + cost = 0 + for j in range(0, len(coreset_dist)): + cost += coreset_dist_multiplicities[j] * Lambd/lambd[curves_dist[j]] * 1/len(curves) * backend.continuous_frechet(candidates[i], coreset_dist[j]).value + if cost < min_cost: + min_cost_curve = i + min_cost = cost + return candidates[min_cost_curve] diff --git a/py/setup.py b/py/setup.py index cac2bf8..c4077f2 100644 --- a/py/setup.py +++ b/py/setup.py @@ -8,6 +8,7 @@ from distutils.version import LooseVersion from setuptools import setup, Extension from setuptools.command.build_ext import build_ext +import setuptools class CMakeExtension(Extension): @@ -74,19 +75,18 @@ def build_extension(self, ext): cwd=self.build_temp, env=env) subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) - print() # Add an empty line for cleaner output - setup( name='Fred', version='1.0', author='Dennis Rohde', author_email='dennis.rohde@tu-dortmund.de', - description='Frechet Distance Library', + description='Frechet Distance and Clustering Library', long_description='', - # add extension module - ext_modules=[CMakeExtension('Fred')], - # add custom build_ext command + packages=setuptools.find_packages(), + ext_package="Fred", + ext_modules=[CMakeExtension('backend')], + install_requires=['cvxopt'], cmdclass=dict(build_ext=CMakeBuild), zip_safe=False, ) diff --git a/py/test.py b/py/test.py deleted file mode 100644 index 1aa0a13..0000000 --- a/py/test.py +++ /dev/null @@ -1,36 +0,0 @@ -import numpy as np -import unittest -import Fred - -eps = 0.00001 - -class TestContinuousFrechet(unittest.TestCase): - - def test_zigzag1d(self): - a = Fred.Curve(np.array([[0.0], [1.0], [0.0], [1.0]])) - b = Fred.Curve(np.array([[0.0],[0.75], [0.25], [1.0]])) - c = Fred.Curve(np.array([[0.0],[1.0]])) - self.assertEqual(Fred.continuous_frechet(a, b, eps).value, 0.25) - self.assertEqual(Fred.continuous_frechet(a, c, eps).value, 0.5) - - def test_longsegment(self): - a = Fred.Curve(np.array([[0.0, 0.0],[500.0e3, 0.0], [1.0e6, 0.0]])) - b = Fred.Curve(np.array([[0.0, 0.0],[1.0e6, 0.0]])) - self.assertEqual(Fred.continuous_frechet(a, b, eps).value, 0.0) - -class TestDiscreteFrechet(unittest.TestCase): - - def test_zigzag1d(self): - a = Fred.Curve(np.array([[0.0], [1.0], [0.0], [1.0]])) - b = Fred.Curve(np.array([[0.0],[0.75], [0.25], [1.0]])) - c = Fred.Curve(np.array([[0.0],[1.0]])) - self.assertEqual(Fred.discrete_frechet(a, b).value, 0.25) - self.assertEqual(Fred.discrete_frechet(a, c).value, 1.0) - - def test_longsegment(self): - a = 
Fred.Curve(np.array([[0.0, 0.0],[500.0e3, 0.0], [1.0e6, 0.0]])) - b = Fred.Curve(np.array([[0.0, 0.0],[1.0e6, 0.0]])) - self.assertEqual(Fred.discrete_frechet(a, b).value, 500000.0) - -if __name__ == '__main__': - unittest.main() diff --git a/py/test/test.py b/py/test/test.py new file mode 100644 index 0000000..6cbb0eb --- /dev/null +++ b/py/test/test.py @@ -0,0 +1,34 @@ +import numpy as np +import unittest +import Fred.backend as fred + +class TestContinuousFrechet(unittest.TestCase): + + def test_zigzag1d(self): + a = fred.Curve(np.array([0.0, 1.0, 0.0, 1.0])) + b = fred.Curve(np.array([0.0, 0.75, 0.25, 1.0])) + c = fred.Curve(np.array([0.0, 1.0])) + self.assertEqual(fred.continuous_frechet(a, b).value, 0.25) + self.assertEqual(fred.continuous_frechet(a, c).value, 0.5) + + def test_longsegment(self): + a = fred.Curve(np.array([0.0,500.0e3, 1.0e6])) + b = fred.Curve(np.array([0.0, 1.0e6])) + self.assertEqual(fred.continuous_frechet(a, b).value, 0.0) + +class TestDiscreteFrechet(unittest.TestCase): + + def test_zigzag1d(self): + a = fred.Curve(np.array([0.0, 1.0, 0.0, 1.0])) + b = fred.Curve(np.array([0.0, 0.75, 0.25, 1.0])) + c = fred.Curve(np.array([0.0, 1.0])) + self.assertEqual(fred.discrete_frechet(a, b).value, 0.25) + self.assertEqual(fred.discrete_frechet(a, c).value, 1.0) + + def test_longsegment(self): + a = fred.Curve(np.array([0.0,500.0e3, 1.0e6])) + b = fred.Curve(np.array([0.0, 1.0e6])) + self.assertEqual(fred.discrete_frechet(a, b).value, 500000.0) + +if __name__ == '__main__': + unittest.main() diff --git a/src/curve.cpp b/src/curve.cpp index 3b41415..5c9e734 100644 --- a/src/curve.cpp +++ b/src/curve.cpp @@ -22,51 +22,97 @@ Curve::Curve(const Points &points, const dimensions_t dimensions) : #endif } -Curve::Curve(const np::ndarray &in): number_dimensions(in.shape(1)) { - auto dimensions = in.get_nd(); - if (dimensions != 2 or in.get_dtype() != np::dtype::get_builtin()){ - std::cerr << "A Polygonal_Curve requires an 2-dimensional numpy array of type double."<< std::endl; - std::cerr << "Current dimensiont: " << dimensions << std::endl; +Curve::Curve(const np::ndarray &in): number_dimensions(in.get_nd()) { + if (number_dimensions > 2){ + std::cerr << "A Curve requires a 1- or 2-dimensional numpy array of type double."<< std::endl; + std::cerr << "Current dimensions: " << number_dimensions << std::endl; + std::cerr << "Current type: " << p::extract(p::str(in.get_dtype())) << std::endl; + std::cerr << "WARNING: constructed empty curve" << std::endl; + return; + } + if (in.get_dtype() != np::dtype::get_builtin()) { + std::cerr << "A Polygonal_Curve requires a 1- or 2-dimensional numpy array of type double."<< std::endl; + std::cerr << "Current dimensions: " << number_dimensions << std::endl; + std::cerr << "Current type: " << p::extract(p::str(in.get_dtype())) << std::endl; + std::cerr << "WARNING: constructed empty curve" << std::endl; return; } auto number_points = in.shape(0); - auto point_size = in.shape(1); - - #if DEBUG - std::cout << "constructing curve of size " << number_points << " and " << point_size << " dimensions" << std::endl; - #endif - - auto strides0 = in.strides(0) / sizeof(coordinate_t); - auto strides1 = in.strides(1) / sizeof(coordinate_t); - - auto data = reinterpret_cast(in.get_data()); - - points = Points(number_points); - - for (index_t i = 0; i < number_points; ++i, data += strides0) { - points[i] = Point(std::vector (point_size)); + if (number_dimensions == 2) { + auto point_size = in.shape(1); + + #if DEBUG + std::cout << "constructing curve of size 
" << number_points << " and " << point_size << " dimensions" << std::endl; + #endif + + auto strides0 = in.strides(0) / sizeof(coordinate_t); + auto strides1 = in.strides(1) / sizeof(coordinate_t); + + auto data = reinterpret_cast(in.get_data()); - auto coord_data = data; + points = Points(number_points); - for(index_t j = 0; j < point_size; ++j, coord_data += strides1){ - points[i].getCoordinates()[j] = *coord_data; + for (index_t i = 0; i < number_points; ++i, data += strides0) { + points[i] = Point(std::vector(point_size)); + + auto coord_data = data; + + for(index_t j = 0; j < point_size; ++j, coord_data += strides1){ + points[i][j] = *coord_data; + } + } + } else { + auto strides0 = in.strides(0) / sizeof(coordinate_t); + + auto data = reinterpret_cast(in.get_data()); + + points = Points(number_points); + + for (index_t i = 0; i < number_points; ++i, data += strides0) { + points[i] = Point(std::vector(1)); + + points[i][0] = *data; } } if (points.empty()) { - std::cerr << "warning: constructed empty curve" << std::endl; + std::cerr << "WARNING: constructed empty curve" << std::endl; return; } } -std::ostream& operator<<(std::ostream &out, const Curve &curve) -{ +std::string Curve::str() const { + std::stringstream ss; + ss << *this; + return ss.str(); +} + +std::string Curves::str() const { + std::stringstream ss; + ss << *this; + return ss.str(); +} + +std::ostream& operator<<(std::ostream &out, const Curve &curve) { out << "["; - for (const auto &point: curve) { - out << point << ", "; + + for (curve_size_t i = 0; i < curve.size() - 1; ++i) { + out << curve[i] << ", "; } - out << "]" << std::endl; + + out << curve[curve.size() -1] << "]"; return out; } +std::ostream& operator<<(std::ostream &out, const Curves &curves) { + out << "{"; + + for (curve_size_t i = 0; i < curves.size() - 1; ++i) { + out << curves[i] << ", "; + } + + out << curves[curves.size() -1] << "}"; + + return out; +} diff --git a/src/frechet.cpp b/src/frechet.cpp index 87f37ac..100f601 100644 --- a/src/frechet.cpp +++ b/src/frechet.cpp @@ -19,12 +19,17 @@ namespace Frechet { namespace Continuous { -auto distance(const Curve &curve1, const Curve &curve2, const distance_t eps) -> Result { - distance_t lb, ub; +auto distance(const Curve &curve1, const Curve &curve2, const distance_t eps, const bool round) -> Result { + if ((curve1.size() < 2) or (curve2.size() < 2)) { + std::cerr << "WARNING: comparison possible only for curves of at least two points" << std::endl; + Result result; + result.value = std::numeric_limits::signaling_NaN(); + return result; + } auto start = boost::chrono::process_real_cpu_clock::now(); - lb = std::sqrt(std::max(curve1[0].dist_sqr(curve2[0]), curve1[curve1.size()-1].dist_sqr(curve2[curve2.size()-1]))); - ub = _greedyUpperBound(curve1, curve2); + const auto lb = std::sqrt(std::max(curve1[0].dist_sqr(curve2[0]), curve1[curve1.size()-1].dist_sqr(curve2[curve2.size()-1]))); + const auto ub = _greedy_upper_bound(curve1, curve2); auto end = boost::chrono::process_real_cpu_clock::now(); #if DEBUG @@ -33,14 +38,15 @@ auto distance(const Curve &curve1, const Curve &curve2, const distance_t eps) -> auto dist = _distance(curve1, curve2, ub, lb, eps); dist.time_bounds = (end-start).count() / 1000000000.0; + if (round) dist.value = std::round(dist.value * 1e3) / 1e3; return dist; } -auto _greedyUpperBound(const Curve &curve1, const Curve &curve2) -> distance_t { +auto _greedy_upper_bound(const Curve &curve1, const Curve &curve2) -> distance_t { distance_t result = 0; - curve_size_t len1 = curve1.size(), 
len2 = curve2.size(); + const curve_size_t len1 = curve1.size(), len2 = curve2.size(); curve_size_t i = 0, j = 0; while ((i < len1 - 1) and (j < len2 - 1)) { @@ -58,9 +64,9 @@ auto _greedyUpperBound(const Curve &curve1, const Curve &curve2) -> distance_t { } } - for(; i < len1 - 1; ++i) result = std::max(result, curve1[i].dist_sqr(curve2[j])); - - for(; j < len2 - 1; ++j) result = std::max(result, curve1[i].dist_sqr(curve2[j])); + while (i < len1) result = std::max(result, curve1[i++].dist_sqr(curve2[j])); + --i; + while (j < len2) result = std::max(result, curve1[i].dist_sqr(curve2[j++])); return std::sqrt(result); } @@ -84,7 +90,7 @@ auto _distance(const Curve &curve1, const Curve &curve2, distance_t ub, distance while (ub - lb > eps) { ++number_searches; split = (ub + lb)/2; - auto isLessThan = _lessThan(split, curve1, curve2, reachable1, reachable2, free_intervals1, free_intervals2); + auto isLessThan = _less_than_or_equal(split, curve1, curve2, reachable1, reachable2, free_intervals1, free_intervals2); if (isLessThan) { ub = split; } @@ -97,7 +103,7 @@ auto _distance(const Curve &curve1, const Curve &curve2, distance_t ub, distance } } - distance_t value = std::round((ub + lb)/2. * 1e3) / 1e3; + distance_t value = (ub + lb)/2.; auto end = boost::chrono::process_real_cpu_clock::now(); result.value = value; result.time_searches = (end-start).count() / 1000000000.0; @@ -105,7 +111,7 @@ auto _distance(const Curve &curve1, const Curve &curve2, distance_t ub, distance return result; } -bool _lessThan(const distance_t distance, Curve const& curve1, Curve const& curve2, +bool _less_than_or_equal(const distance_t distance, Curve const& curve1, Curve const& curve2, std::vector> &reachable1, std::vector> &reachable2, std::vector> &free_intervals1, std::vector> &free_intervals2) { assert(curve1.size() >= 2); diff --git a/src/fred_python_wrapper.cpp b/src/fred_python_wrapper.cpp index 61f696e..2a572f4 100644 --- a/src/fred_python_wrapper.cpp +++ b/src/fred_python_wrapper.cpp @@ -10,10 +10,12 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI #include -#include "frechet.hpp" #include "curve.hpp" +#include "point.hpp" +#include "frechet.hpp" #include "jl_transform.hpp" #include "clustering.hpp" +#include "coreset.hpp" using namespace boost::python; namespace np = boost::python::numpy; @@ -22,68 +24,100 @@ namespace fd = Frechet::Discrete; distance_t epss = 0.001; -fc::Result continuous_frechet(const Curve &curve1, const Curve &curve2, const distance_t eps) { - return fc::distance(curve1, curve2, eps); +fc::Result continuous_frechet(const Curve &curve1, const Curve &curve2, const distance_t eps = 0.001, const bool round = true) { + return fc::distance(curve1, curve2, eps, round); } +BOOST_PYTHON_FUNCTION_OVERLOADS(continuous_frechet_overloads, continuous_frechet, 2, 4); + fd::Result discrete_frechet(const Curve &curve1, const Curve &curve2) { return fd::distance(curve1, curve2); } -Curves jl_transform(const Curves &in, const double epsilon, const bool empirical_constant) { +Curves jl_transform(const Curves &in, const double epsilon, const bool empirical_constant = true) { Curves curvesrp = JLTransform::transform_naive(in, epsilon, empirical_constant); return curvesrp; } -Clustering::Clustering_Result kcenter(const curve_size_t num_centers, const Curves &in, const distance_t eps, const bool with_assignment = false) { - auto result = Clustering::gonzalez(num_centers, in, eps, false, with_assignment); +BOOST_PYTHON_FUNCTION_OVERLOADS(jl_transform_overloads, jl_transform, 
2, 3); + +Clustering::Clustering_Result kcenter(const curve_size_t num_centers, const Curves &in, + const distance_t eps = 0.001, const bool round = true, const bool with_assignment = false) { + auto result = Clustering::gonzalez(num_centers, in, eps, round, false, with_assignment); return result; } -Clustering::Clustering_Result onemedian_sampling(const Curves &in, const double epsilon, const distance_t eps, const bool with_assignment = false) { +BOOST_PYTHON_FUNCTION_OVERLOADS(kcenter_overloads, kcenter, 2, 5); + +Clustering::Clustering_Result onemedian_sampling(const Curves &in, const double epsilon, + const distance_t eps = 0.001, const bool round = true, const bool with_assignment = false) { - auto result = Clustering::one_median_sampling(epsilon, in, eps, with_assignment); + auto result = Clustering::one_median_sampling(epsilon, in, eps, round, with_assignment); return result; } -Clustering::Clustering_Result onemedian_exhaust(const Curves &in, const distance_t eps, const bool with_assignment = false) { +BOOST_PYTHON_FUNCTION_OVERLOADS(onemedian_sampling_overloads, onemedian_sampling, 2, 5); - auto result = Clustering::one_median_exhaustive(in, eps, with_assignment); +Clustering::Clustering_Result onemedian_exhaustive(const Curves &in, + const distance_t eps = 0.001, const bool round = true, const bool with_assignment = false) { + + auto result = Clustering::one_median_exhaustive(in, eps, round, with_assignment); return result; } -Clustering::Clustering_Result kmedian(const curve_size_t num_centers, const Curves &in, const distance_t eps, const bool with_assignment = false) { +BOOST_PYTHON_FUNCTION_OVERLOADS(onemedian_exhaustive_overloads, onemedian_exhaustive, 1, 4); + +Clustering::Clustering_Result kmedian(const curve_size_t num_centers, const Curves &in, + const distance_t eps = 0.001, const bool round = true, const bool with_assignment = false) { - auto result = Clustering::arya(num_centers, in, eps, with_assignment); + auto result = Clustering::arya(num_centers, in, eps, round, with_assignment); return result; } -BOOST_PYTHON_MODULE(Fred) +BOOST_PYTHON_FUNCTION_OVERLOADS(kmedian_overloads, kmedian, 2, 5); + +Coreset::Onemedian_Coreset onemedian_coreset(const Curves &in, const double epsilon, + const double eps = 0.001, const bool round = true, const double constant = 1) { + return Coreset::Onemedian_Coreset(in, epsilon, eps, round, constant); +} + +BOOST_PYTHON_FUNCTION_OVERLOADS(onemedian_coreset_overloads, onemedian_coreset, 2, 5); + +BOOST_PYTHON_MODULE(backend) { Py_Initialize(); np::initialize(); scope().attr("epsilon") = epss; + class_("Point", init<>()) + .def("__len__", &Point::size) + .def("__getitem__", static_cast(&Point::operator[])) + .def("__str__", &Point::str) + ; + class_("Curve", init()) .add_property("dimensions", &Curve::dimensions) .add_property("points", &Curve::size) + .def("__getitem__", static_cast(&Curve::operator[])) .def("__len__", &Curve::size) + .def("__str__", &Curve::str) ; - + class_("Curves", init<>()) - .def("add", static_cast(&Curves::push_back)) + .def("add", &Curves::add) .def("__getitem__", &Curves::get) - .def("__len__", &Curves::size) + .def("__len__", &Curves::number) + .add_property("m", &Curves::get_m) + .def("__str__", &Curves::str) ; - class_("Clustering_Result", init<>()) .def("__getitem__", &Clustering::Clustering_Result::get) .def("__len__", &Clustering::Clustering_Result::size) @@ -95,7 +129,7 @@ BOOST_PYTHON_MODULE(Fred) class_("Clustering_Assignment", init<>()) .def("__len__", &Clustering::Cluster_Assignment::size) 
.def("count", &Clustering::Cluster_Assignment::count) - .def("__getitem__", &Clustering::Cluster_Assignment::get) + .def("get", &Clustering::Cluster_Assignment::get) ; class_("Continuous_Frechet_Result", init<>()) @@ -110,12 +144,19 @@ BOOST_PYTHON_MODULE(Fred) .add_property("value", &fd::Result::value) ; + class_("Onemedian_coreset") + .add_property("lambd", &Coreset::Onemedian_Coreset::get_lambda) + .add_property("Lambd", &Coreset::Onemedian_Coreset::get_Lambda) + .def("curves", &Coreset::Onemedian_Coreset::get_curves) + .add_property("cost", &Coreset::Onemedian_Coreset::get_cost) + ; - def("dimension_reduction", jl_transform); - def("continuous_frechet", continuous_frechet); + def("dimension_reduction", jl_transform, jl_transform_overloads()); + def("continuous_frechet", continuous_frechet, continuous_frechet_overloads()); def("discrete_frechet", discrete_frechet); - def("discrete_kcenter", kcenter); - def("discrete_kmedian", kmedian); - def("discrete_onemedian_sampling", onemedian_sampling); - def("discrete_onemedian_exhaustive", onemedian_exhaust); + def("discrete_kcenter", kcenter, kcenter_overloads()); + def("discrete_kmedian", kmedian, kmedian_overloads()); + def("discrete_onemedian_sampling", onemedian_sampling, onemedian_sampling_overloads()); + def("discrete_onemedian_exhaustive", onemedian_exhaustive, onemedian_exhaustive_overloads()); + def("onemedian_coreset", onemedian_coreset, onemedian_coreset_overloads()); } diff --git a/src/point.cpp b/src/point.cpp index bce45df..d159418 100644 --- a/src/point.cpp +++ b/src/point.cpp @@ -10,16 +10,20 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI #include "point.hpp" -std::ostream& operator<<(std::ostream &out, const Point &p) -{ +std::string Point::str() const { + std::stringstream ss; + ss << *this; + return ss.str(); +} + +std::ostream& operator<<(std::ostream &out, const Point &p) { out << "("; - for (dimensions_t i = 0; i < p.getCoordinates().size(); i++){ - out << p.getCoordinates().at(i); - if( i != p.getCoordinates().size() - 1){ - out << ","; - } + + for (dimensions_t i = 0; i < p.size() - 1; ++i){ + out << p[i] << ","; } - out << ")" << std::endl; + + out << p[p.size() - 1] << ")"; return out; }