Skip to content

Commit

Permalink
Merge pull request #155 from apache/patch_for_rc4
Browse files Browse the repository at this point in the history
Apply changes to create rc4
  • Loading branch information
jmalkin authored Jun 3, 2020
2 parents dbc8b4a + 6f04d3f commit 90b6689
Show file tree
Hide file tree
Showing 8 changed files with 132 additions and 47 deletions.
2 changes: 2 additions & 0 deletions theta/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ set(theta_HEADERS "")
list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_union.hpp;include/theta_intersection.hpp")
list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/binomial_bounds.hpp;include/theta_sketch_impl.hpp")
list(APPEND theta_HEADERS "include/theta_union_impl.hpp;include/theta_intersection_impl.hpp;include/theta_a_not_b_impl.hpp")
list(APPEND theta_HEADERS "include/conditional_back_inserter.hpp")

install(TARGETS theta
EXPORT ${PROJECT_NAME}
Expand All @@ -55,4 +56,5 @@ target_sources(theta
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_back_inserter.hpp
)
63 changes: 63 additions & 0 deletions theta/include/conditional_back_inserter.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#ifndef CONDITIONAL_BACK_INSERTER_HPP_
#define CONDITIONAL_BACK_INSERTER_HPP_

#include <iterator>
#include <functional>

namespace datasketches {

template <typename Container, typename Predicate>
class conditional_back_insert_iterator: public std::back_insert_iterator<Container> {
public:
template<typename P>
conditional_back_insert_iterator(Container& c, P&& p): std::back_insert_iterator<Container>(c), p(std::forward<P>(p)) {}

// MSVC seems to insist on having copy constructor and assignment
conditional_back_insert_iterator(const conditional_back_insert_iterator& other):
std::back_insert_iterator<Container>(other), p(other.p) {}
conditional_back_insert_iterator& operator=(const conditional_back_insert_iterator& other) {
std::back_insert_iterator<Container>::operator=(other);
p = other.p;
return *this;
}

conditional_back_insert_iterator& operator=(typename Container::const_reference value) {
if (p(value)) std::back_insert_iterator<Container>::operator=(value);
return *this;
}

conditional_back_insert_iterator& operator*() { return *this; }
conditional_back_insert_iterator& operator++() { return *this; }
conditional_back_insert_iterator& operator++(int) { return *this; }

private:
Predicate p;
};

template<typename Container, typename Predicate>
conditional_back_insert_iterator<Container, Predicate> conditional_back_inserter(Container& c, Predicate&& p) {
return conditional_back_insert_iterator<Container, Predicate>(c, std::forward<Predicate>(p));
}

} /* namespace datasketches */

#endif
7 changes: 7 additions & 0 deletions theta/include/theta_a_not_b.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ class theta_a_not_b_alloc {
typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
uint16_t seed_hash_;

class less_than {
public:
explicit less_than(uint64_t value): value(value) {}
bool operator()(uint64_t value) const { return value < this->value; }
private:
uint64_t value;
};
};

// alias with default allocator for convenience
Expand Down
67 changes: 25 additions & 42 deletions theta/include/theta_a_not_b_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

#include <algorithm>

#include "conditional_back_inserter.hpp"

namespace datasketches {

/*
Expand All @@ -37,61 +39,42 @@ seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))

template<typename A>
compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered) const {
if (a.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
if (a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);

const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
vector_u64<A> keys;
uint32_t keys_size = 0;
uint32_t count = 0;
bool is_empty = a.is_empty();

if (b.get_num_retained() == 0) {
for (auto key: a) if (key < theta) ++count;
keys.resize(count);
std::copy_if(a.begin(), a.end(), &keys[0], [theta](uint64_t key) { return key < theta; });
if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
if (count == 0 && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
}

keys_size = a.get_num_retained();
keys.resize(keys_size);

if (a.is_ordered() && b.is_ordered()) { // sort-based
const auto end = std::set_difference(a.begin(), a.end(), b.begin(), b.end(), keys.begin());
count = end - keys.begin();
} else { // hash-based
const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
vector_u64<A> b_hash_table(1 << lg_size, 0);
for (auto key: b) {
if (key < theta) {
update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
} else if (b.is_ordered()) {
break; // early stop
std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than(theta));
} else {
if (a.is_ordered() && b.is_ordered()) { // sort-based
std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than(theta)));
} else { // hash-based
const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
vector_u64<A> b_hash_table(1 << lg_size, 0);
for (auto key: b) {
if (key < theta) {
update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
} else if (b.is_ordered()) {
break; // early stop
}
}
}

// scan A lookup B
for (auto key: a) {
if (key < theta) {
if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys[count++] = key;
} else if (a.is_ordered()) {
break; // early stop
// scan A lookup B
for (auto key: a) {
if (key < theta) {
if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
} else if (a.is_ordered()) {
break; // early stop
}
}
}
}

if (count == 0) {
keys.resize(0);
if (theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
} else if (count < keys_size) {
keys.resize(count);
if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
}

if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
}

Expand Down
2 changes: 1 addition & 1 deletion theta/include/theta_intersection_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
template<typename A>
void theta_intersection_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
if (is_empty_) return;
if (sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
if (!sketch.is_empty() && sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
is_empty_ |= sketch.is_empty();
theta_ = std::min(theta_, sketch.get_theta64());
if (is_valid_ && num_keys_ == 0) return;
Expand Down
14 changes: 10 additions & 4 deletions theta/include/theta_sketch_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,9 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(st
is.read((char*)&seed_hash, sizeof(seed_hash));

check_serial_version(serial_version, SERIAL_VERSION);
check_seed_hash(seed_hash, get_seed_hash(seed));

if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
check_seed_hash(seed_hash, get_seed_hash(seed));
typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
return unique_ptr(
Expand All @@ -114,6 +114,8 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(st
}
);
} else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
return unique_ptr(
static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(compact_theta_sketch_alloc<A>::internal_deserialize(is, preamble_longs, flags_byte, seed_hash))),
Expand Down Expand Up @@ -146,9 +148,9 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(co
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));

check_serial_version(serial_version, SERIAL_VERSION);
check_seed_hash(seed_hash, get_seed_hash(seed));

if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
check_seed_hash(seed_hash, get_seed_hash(seed));
typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
return unique_ptr(
Expand All @@ -161,6 +163,8 @@ typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(co
}
);
} else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
return unique_ptr(
static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(
Expand Down Expand Up @@ -753,7 +757,8 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
is.read((char*)&seed_hash, sizeof(seed_hash));
theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
return internal_deserialize(is, preamble_longs, flags_byte, seed_hash);
}

Expand Down Expand Up @@ -801,7 +806,8 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash);
}

Expand Down
24 changes: 24 additions & 0 deletions theta/test/theta_a_not_b_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,4 +217,28 @@ TEST_CASE("theta a-not-b: seed mismatch", "[theta_a_not_b]") {
REQUIRE_THROWS_AS(a_not_b.compute(sketch, sketch), std::invalid_argument);
}

TEST_CASE("theta a-not-b: issue #152", "[theta_a_not_b]") {
update_theta_sketch a = update_theta_sketch::builder().build();
int value = 0;
for (int i = 0; i < 10000; i++) a.update(value++);

update_theta_sketch b = update_theta_sketch::builder().build();
value = 5000;
for (int i = 0; i < 25000; i++) b.update(value++);

theta_a_not_b a_not_b;

// unordered inputs
compact_theta_sketch result = a_not_b.compute(a, b);
REQUIRE_FALSE(result.is_empty());
REQUIRE(result.is_estimation_mode());
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));

// ordered inputs
result = a_not_b.compute(a.compact(), b.compact());
REQUIRE_FALSE(result.is_empty());
REQUIRE(result.is_estimation_mode());
REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
}

} /* namespace datasketches */
Binary file modified theta/test/theta_compact_empty_from_java.sk
Binary file not shown.

0 comments on commit 90b6689

Please sign in to comment.