Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add PFD verification #463

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions examples/basic/verifying_pfd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import desbordante

# Error threshold: print_results reports the pFD as holding when the computed
# error does not exceed this value.
ERROR = 0.2
# Values passed as the `error_measure` option when executing the verifier.
PER_TUPLE = 'per_tuple'
PER_VALUE = 'per_value'
# Path of the dataset loaded by this example.
TABLE = 'examples/datasets/glitchy_sensor_2.csv'


# Prints the outcome of an already executed pFD verification.
#
# pfd_verifier: object providing get_error(), get_num_violating_clusters()
#               and get_violating_clusters().
#
# The computed error is compared with the module-level ERROR threshold; the
# violating clusters reported by the verifier are printed one per line.
# NOTE(review): indentation was lost in this view of the file, so the exact
# nesting of the cluster listing and of the trailing print() (inside the
# `else` branch or at function level) cannot be determined here — confirm
# against the real file before editing.
def print_results(pfd_verifier):
error = pfd_verifier.get_error()
# The dependency "holds" when the measured error fits within the threshold.
if error <= ERROR:
print('PFD holds')
else:
print(f'PFD with error {ERROR} does not hold')
# Report the smallest error for which the dependency would hold.
print(f'instead it holds with error {error}')
print(f'Clusters violating PFD ({pfd_verifier.get_num_violating_clusters()}):')
for cluster in pfd_verifier.get_violating_clusters():
print(cluster)
# Blank line separating this report from the following output.
print()


# Create the verifier and load the example table.
# NOTE(review): the tuple is presumably (path, separator, has_header) — confirm.
verifier = desbordante.pfd_verification.algorithms.PFDVerifier()
verifier.load_data(table=(TABLE, ',', True))

# Both runs check the same dependency (column 1 -> column 2); only the error
# measure differs between them.
dependency = {'lhs_indices': [1], 'rhs_indices': [2], 'error': ERROR}

verifier.execute(**dependency, error_measure=PER_VALUE)
print('Checking whether pFD [device_id] -> [data]')
print(f'with error {ERROR} and PerValue error measure holds:')
print_results(verifier)

verifier.execute(**dependency, error_measure=PER_TUPLE)
print('Checking whether the same PFD holds for PerTuple error measure:')
print_results(verifier)
11 changes: 11 additions & 0 deletions examples/datasets/glitchy_sensor_2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Id,DeviceId,Data
1,D-1,1001
2,D-1,1002
3,D-1,1003
4,D-1,1004
5,D-1,1005
6,D-1,1006
7,D-2,1000
8,D-3,1000
9,D-4,1000
10,D-5,1000
7 changes: 5 additions & 2 deletions src/core/algorithms/algorithm_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ using AlgorithmTypes =
Apriori, metric::MetricVerifier, DataStats, fd_verifier::FDVerifier, HyUCC,
PyroUCC, cfd::FDFirstAlgorithm, ACAlgorithm, UCCVerifier, Faida, Spider, Mind,
Fastod, GfdValidation, EGfdValidation, NaiveGfdValidation, order::Order,
dd::Split>;
dd::Split, PFDVerifier>;

// clang-format off
/* Enumeration of all supported non-pipeline algorithms. If you implement a new
Expand Down Expand Up @@ -76,7 +76,10 @@ BETTER_ENUM(AlgorithmType, char,
order,

/* Differential dependencies mining algorithm */
split
split,

/* PFD verifier algorithm */
pfd_verifier
)
// clang-format on

Expand Down
92 changes: 92 additions & 0 deletions src/core/algorithms/fd/pfd_verifier/pfd_stats_calculator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#pragma once

#include <algorithm>
#include <cstddef>
#include <deque>
#include <memory>
#include <utility>
#include <vector>

#include "config/error/type.h"
#include "config/error_measure/type.h"
#include "config/indices/type.h"
#include "model/table/column_layout_relation_data.h"
#include "model/table/position_list_index.h"

namespace algos {

/// Computes how well a probabilistic functional dependency (pFD) X -> A holds:
/// the resulting error, the X-clusters that violate the dependency and the
/// number of rows that fall outside the dominant value of their cluster.
class PFDStatsCalculator {
private:
    std::shared_ptr<ColumnLayoutRelationData> relation_;
    config::ErrorMeasureType error_measure_;

    std::vector<model::PLI::Cluster> clusters_violating_pfd_;
    size_t num_rows_violating_pfd_ = 0;
    config::ErrorType error_ = 0.0;

public:
    /// @param relation relation the verified dependency belongs to
    /// @param measure  error measure (per tuple or per value) used for the error
    explicit PFDStatsCalculator(std::shared_ptr<ColumnLayoutRelationData> relation,
                                config::ErrorMeasureType measure)
        : relation_(std::move(relation)), error_measure_(measure) {}

    /// Discards the results of the previous calculation.
    void ResetState() {
        clusters_violating_pfd_.clear();
        num_rows_violating_pfd_ = 0;
        error_ = 0;
    }

    /// Number of X-clusters whose rows do not all share a single A value.
    size_t GetNumViolatingClusters() const {
        return clusters_violating_pfd_.size();
    }

    /// Sum over all X-clusters of (cluster size - size of its largest XA sub-cluster).
    size_t GetNumViolatingRows() const {
        return num_rows_violating_pfd_;
    }

    /// Error computed by the last CalculateStatistics call (1 - probability).
    config::ErrorType GetError() const {
        return error_;
    }

    /// X-clusters recorded as violating by the last CalculateStatistics call.
    std::vector<model::PLI::Cluster> const& GetViolatingClusters() const {
        return clusters_violating_pfd_;
    }

    /// Calculates the error and the violating clusters of the dependency.
    ///
    /// Results of any earlier call are discarded first. For an empty relation
    /// the error is 0 and nothing is reported as violating.
    ///
    /// @param x_pli  position list index of the left-hand side X
    /// @param xa_pli position list index of X together with the right-hand side A
    void CalculateStatistics(model::PositionListIndex const* x_pli,
                             model::PositionListIndex const* xa_pli) {
        using Cluster = model::PLI::Cluster;

        // Start from a clean state so that repeated calls do not accumulate
        // clusters and row counts from earlier runs.
        ResetState();

        if (x_pli->GetRelationSize() == 0) {
            // No rows means nothing can violate the dependency; returning here
            // also avoids the 0 / 0 division below, which would yield NaN.
            return;
        }

        std::deque<Cluster> xa_index = xa_pli->GetIndex();
        std::shared_ptr<Cluster const> probing_table = x_pli->CalculateAndGetProbingTable();
        // Group the XA-clusters by the X-cluster they belong to. std::sort is
        // not stable, and the merge loop below consumes the XA-clusters of one
        // X-cluster in ascending order of their first row, so ties must be
        // broken by that first row explicitly.
        // NOTE(review): this relies on the probing table numbering X-clusters
        // in the same order as x_pli->GetIndex() — confirm.
        std::sort(xa_index.begin(), xa_index.end(),
                  [&probing_table](Cluster const& a, Cluster const& b) {
                      auto const a_group = probing_table->at(a.front());
                      auto const b_group = probing_table->at(b.front());
                      if (a_group != b_group) {
                          return a_group < b_group;
                      }
                      return a.front() < b.front();
                  });

        bool const per_tuple = error_measure_ == +ErrorMeasure::per_tuple;
        double sum = 0.0;
        std::size_t cluster_rows_count = 0;
        std::deque<Cluster> const& x_index = x_pli->GetIndex();
        auto xa_cluster_it = xa_index.begin();

        for (Cluster const& x_cluster : x_index) {
            // With no XA-cluster inside, the largest group of rows agreeing on
            // A consists of a single row.
            std::size_t largest_group = 1;
            std::size_t const x_cluster_size = x_cluster.size();
            for (int x_row : x_cluster) {
                if (xa_cluster_it == xa_index.end()) {
                    break;
                }
                // An XA-cluster is recognised by its first row.
                if (x_row == xa_cluster_it->at(0)) {
                    largest_group = std::max(largest_group, xa_cluster_it->size());
                    ++xa_cluster_it;
                }
            }
            if (largest_group != x_cluster_size) {
                clusters_violating_pfd_.push_back(x_cluster);
            }
            num_rows_violating_pfd_ += x_cluster_size - largest_group;
            sum += per_tuple ? static_cast<double>(largest_group)
                             : static_cast<double>(largest_group) / x_cluster_size;
            cluster_rows_count += x_cluster_size;
        }

        // Rows that are not part of any X-cluster count as satisfying the
        // dependency on their own.
        unsigned int const unique_rows =
                static_cast<unsigned int>(x_pli->GetRelationSize() - cluster_rows_count);
        double const denominator =
                per_tuple ? static_cast<double>(x_pli->GetRelationSize())
                          : static_cast<double>(x_index.size() + unique_rows);
        double const probability = (sum + unique_rows) / denominator;
        error_ = 1.0 - probability;
    }
};

} // namespace algos
67 changes: 67 additions & 0 deletions src/core/algorithms/fd/pfd_verifier/pfd_verifier.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#include "algorithms/fd/pfd_verifier/pfd_verifier.h"

#include <chrono>
#include <memory>

#include "algorithms/algorithm.h"
#include "config/names.h"
#include "config/tabular_data/input_table/option.h"
#include "equal_nulls/option.h"
#include "error_measure/option.h"
#include "indices/option.h"

namespace algos {

/// Registers every option this algorithm accepts: the input table, null
/// semantics, both index lists and the error measure.
void PFDVerifier::RegisterOptions() {
    // This function runs in the constructor, before LoadDataInternal assigns
    // relation_, so the column count is handed to the index options as a
    // callback rather than as a value.
    auto column_count = [this]() { return relation_->GetSchema()->GetNumColumns(); };

    RegisterOption(config::kTableOpt(&input_table_));
    RegisterOption(config::kEqualNullsOpt(&is_null_equal_null_));

    RegisterOption(config::kLhsIndicesOpt(&lhs_indices_, column_count));
    RegisterOption(config::kRhsIndicesOpt(&rhs_indices_, column_count));

    RegisterOption(config::kErrorMeasureOpt(&error_measure_));
}

void PFDVerifier::MakeExecuteOptsAvailable() {
using namespace config::names;
MakeOptionsAvailable({kLhsIndices, kRhsIndices, kErrorMeasure});
}

/// Builds the column-layout relation from the configured input table.
/// @throws std::runtime_error if the resulting relation has no column data
void PFDVerifier::LoadDataInternal() {
    relation_ = ColumnLayoutRelationData::CreateFrom(*input_table_, is_null_equal_null_);

    bool const has_columns = !relation_->GetColumnData().empty();
    if (has_columns) {
        return;
    }
    throw std::runtime_error("Got an empty dataset: pFD verifying is meaningless.");
}

/// Verifies the configured pFD and returns the elapsed time in milliseconds.
unsigned long long PFDVerifier::ExecuteInternal() {
    // steady_clock is monotonic; system_clock may be adjusted while the
    // algorithm runs, which could yield a negative (and, after conversion to
    // the unsigned return type, enormous) duration.
    auto const start_time = std::chrono::steady_clock::now();

    // A new calculator is created for each run so that it is constructed with
    // the currently configured error measure.
    stats_calculator_ = std::make_unique<PFDStatsCalculator>(relation_, error_measure_);
    VerifyPFD();

    auto const elapsed_milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::steady_clock::now() - start_time);
    return elapsed_milliseconds.count();
}

void PFDVerifier::VerifyPFD() const {
std::shared_ptr<model::PLI const> lhs_pli = CalculatePLI(lhs_indices_);
std::shared_ptr<model::PLI const> rhs_pli = CalculatePLI(rhs_indices_);
std::unique_ptr<model::PLI const> intersection_pli = lhs_pli->Intersect(rhs_pli.get());
stats_calculator_->CalculateStatistics(lhs_pli.get(), intersection_pli.get());
}

/// Returns the PLI of the column set given by `indices`: the PLI of the first
/// column intersected with the PLI of every following column in turn.
/// `indices` must contain at least one element.
std::shared_ptr<model::PLI const> PFDVerifier::CalculatePLI(
        config::IndicesType const& indices) const {
    auto index_it = indices.begin();
    std::shared_ptr<model::PLI const> combined =
            relation_->GetColumnData(*index_it).GetPliOwnership();

    for (++index_it; index_it != indices.end(); ++index_it) {
        combined = combined->Intersect(relation_->GetColumnData(*index_it).GetPositionListIndex());
    }
    return combined;
}

/// Registers all options and makes the data-loading ones (table and null
/// semantics) available.
PFDVerifier::PFDVerifier() : Algorithm({}) {
    RegisterOptions();
    MakeOptionsAvailable({config::names::kTable, config::names::kEqualNulls});
}

} // namespace algos
66 changes: 66 additions & 0 deletions src/core/algorithms/fd/pfd_verifier/pfd_verifier.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#pragma once

#include <memory>
#include <vector>

#include "algorithms/algorithm.h"
#include "algorithms/fd/pfd_verifier/pfd_stats_calculator.h"
#include "algorithms/fd/tane/enums.h"
#include "config/equal_nulls/type.h"
#include "config/error/type.h"
#include "config/error_measure/type.h"
#include "config/indices/type.h"
#include "config/tabular_data/input_table_type.h"

namespace algos {

class PFDVerifier : public Algorithm {
private:
config::InputTable input_table_;

config::IndicesType lhs_indices_;
config::IndicesType rhs_indices_;
config::EqNullsType is_null_equal_null_;
config::ErrorMeasureType error_measure_ = +ErrorMeasure::per_tuple;

std::shared_ptr<ColumnLayoutRelationData> relation_;
std::unique_ptr<PFDStatsCalculator> stats_calculator_;

void ResetState() override {
if (stats_calculator_) {
stats_calculator_->ResetState();
}
}

void RegisterOptions();
void MakeExecuteOptsAvailable() override;
void LoadDataInternal() override;
unsigned long long ExecuteInternal() override;
void VerifyPFD() const;
std::shared_ptr<model::PLI const> CalculatePLI(config::IndicesType const& indices) const;

public:
size_t GetNumViolatingClusters() const {
assert(stats_calculator_);
return stats_calculator_->GetNumViolatingClusters();
}

size_t GetNumViolatingRows() const {
assert(stats_calculator_);
return stats_calculator_->GetNumViolatingRows();
}

std::vector<model::PLI::Cluster> const& GetViolatingClusters() const {
assert(stats_calculator_);
return stats_calculator_->GetViolatingClusters();
}

double GetError() const {
assert(stats_calculator_);
return stats_calculator_->GetError();
}

PFDVerifier();
};

} // namespace algos
1 change: 1 addition & 0 deletions src/core/algorithms/fd/verification_algorithms.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#pragma once

#include "algorithms/fd/fd_verifier/fd_verifier.h"
#include "algorithms/fd/pfd_verifier/pfd_verifier.h"
4 changes: 3 additions & 1 deletion src/python_bindings/bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "nd/bind_nd.h"
#include "nd/bind_nd_verification.h"
#include "od/bind_od.h"
#include "pfd/bind_pfd_verification.h"
#include "statistics/bind_statistics.h"
#include "ucc/bind_ucc.h"
#include "ucc/bind_ucc_verification.h"
Expand All @@ -40,7 +41,8 @@ PYBIND11_MODULE(desbordante, module, pybind11::mod_gil_not_used()) {
for (auto bind_func :
{BindMainClasses, BindDataTypes, BindFd, BindCfd, BindAr, BindUcc, BindAc, BindOd, BindNd,
BindFdVerification, BindMfdVerification, BindUccVerification, BindStatistics, BindInd,
BindGfdVerification, BindSplit, BindDynamicFdVerification, BindNdVerification}) {
BindGfdVerification, BindSplit, BindDynamicFdVerification, BindNdVerification,
BindPfdVerification}) {
bind_func(module);
}
}
Expand Down
25 changes: 25 additions & 0 deletions src/python_bindings/pfd/bind_pfd_verification.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#include "bind_pfd_verification.h"

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include "algorithms/fd/pfd_verifier/pfd_verifier.h"
#include "py_util/bind_primitive.h"

namespace {
namespace py = pybind11;
} // namespace

namespace python_bindings {
/// Adds the `pfd_verification` submodule to `main_module` and binds the
/// PFDVerifier algorithm together with its result getters inside it.
void BindPfdVerification(py::module_& main_module) {
    using namespace algos;
    // def_submodule already attaches the new module to main_module under the
    // given name, so no separate attribute assignment is required.
    auto pfd_verification_module = main_module.def_submodule("pfd_verification");

    BindPrimitiveNoBase<PFDVerifier>(pfd_verification_module, "PFDVerifier")
            .def("get_num_violating_clusters", &PFDVerifier::GetNumViolatingClusters)
            .def("get_num_violating_rows", &PFDVerifier::GetNumViolatingRows)
            .def("get_violating_clusters", &PFDVerifier::GetViolatingClusters)
            .def("get_error", &PFDVerifier::GetError);
}
} // namespace python_bindings
7 changes: 7 additions & 0 deletions src/python_bindings/pfd/bind_pfd_verification.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#pragma once

#include <pybind11/pybind11.h>

namespace python_bindings {
/// Adds the `pfd_verification` submodule, containing the PFDVerifier bindings,
/// to `main_module`.
void BindPfdVerification(pybind11::module_& main_module);
} // namespace python_bindings
Loading
Loading