Skip to content

Commit

Permalink
v1.7.6 add jaccard distance
Browse files Browse the repository at this point in the history
  • Loading branch information
masajiro committed Jun 26, 2019
1 parent 60d3115 commit 6cd35d6
Show file tree
Hide file tree
Showing 18 changed files with 209 additions and 38 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Neighborhood Graph and Tree for Indexing High-dimensional Data
**NGT** provides commands and a library for performing high-speed approximate nearest neighbor searches against a large volume of data (several million to several tens of millions of items) in a high-dimensional vector space (several tens to several thousand dimensions).

News
- 06/26/2019 Jaccard distance is available. (v1.7.6)
- 06/10/2019 PyPI NGT package v1.7.5 is now available.
- 01/17/2019 Python NGT can be installed via pip from PyPI. (v1.5.1)
- 12/14/2018 [NGTQ](bin/ngtq/README.md) (NGT with Quantization) is now available. (v1.5.0)
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.7.5
1.7.6
12 changes: 12 additions & 0 deletions lib/NGT/Capi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,18 @@ bool ngt_set_property_distance_type_hamming(NGTProperty prop, NGTError error) {
return true;
}

// Selects the Jaccard distance as the distance type of the given property.
//
// prop  : property handle; it is cast to NGT::Property* and must not be NULL.
// error : error handle forwarded to operate_error_string_ together with the
//         diagnostic message when prop is NULL.
// Returns true when the distance type was set, false when prop is NULL.
bool ngt_set_property_distance_type_jaccard(NGTProperty prop, NGTError error) {
  if (prop != NULL) {
    NGT::Property *property = static_cast<NGT::Property*>(prop);
    property->distanceType = NGT::Index::Property::DistanceType::DistanceTypeJaccard;
    return true;
  }

  // Invalid handle: report the rejecting function and the offending value.
  std::stringstream message;
  message << "Capi : " << __FUNCTION__ << "() : parametor error: prop = " << prop;
  operate_error_string_(message, error);
  return false;
}

bool ngt_set_property_distance_type_cosine(NGTProperty prop, NGTError error) {
if(prop == NULL){
std::stringstream ss;
Expand Down
2 changes: 2 additions & 0 deletions lib/NGT/Capi.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ bool ngt_set_property_distance_type_angle(NGTProperty, NGTError);

bool ngt_set_property_distance_type_hamming(NGTProperty, NGTError);

bool ngt_set_property_distance_type_jaccard(NGTProperty, NGTError);

bool ngt_set_property_distance_type_cosine(NGTProperty, NGTError);

bool ngt_set_property_distance_type_normalized_angle(NGTProperty, NGTError);
Expand Down
2 changes: 1 addition & 1 deletion lib/NGT/Clustering.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ namespace NGT {
double csum = 0.0;
float *x = a;
float *y = b;
for (int i = 0; i < size; i++) {
for (size_t i = 0; i < size; i++) {
double d = (double)*x++ - (double)*y++;
csum += d * d;
}
Expand Down
5 changes: 4 additions & 1 deletion lib/NGT/Command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
const string usage = "Usage: ngt create "
"-d dimension [-p #-of-thread] [-i index-type(t|g)] [-g graph-type(a|k|b|o|i)] "
"[-t truncation-edge-limit] [-E edge-size] [-S edge-size-for-search] [-L edge-size-limit] "
"[-e epsilon] [-o object-type(f|c)] [-D distance-function(1|2|a|A|h|c|C)] [-n #-of-inserted-objects] "
"[-e epsilon] [-o object-type(f|c)] [-D distance-function(1|2|a|A|h|j|c|C)] [-n #-of-inserted-objects] "
"[-P path-adjustment-interval] [-B dynamic-edge-size-base] [-A object-alignment(t|f)] "
"[-T build-time-limit] [-O outgoing x incoming] "
"index(output) [data.tsv(input)]";
Expand Down Expand Up @@ -153,6 +153,9 @@
case 'h':
property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeHamming;
break;
case 'j':
property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeJaccard;
break;
case 'c':
property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeCosine;
break;
Expand Down
6 changes: 6 additions & 0 deletions lib/NGT/Graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,12 @@ NeighborhoodGraph::Search::hammingUint8(NeighborhoodGraph &graph, NGT::SearchCon
{
graph.searchReadOnlyGraph<PrimitiveComparator::HammingUint8>(sc, seeds);
}

// Search entry point for the Jaccard distance: runs
// graph.searchReadOnlyGraph instantiated with the
// PrimitiveComparator::JaccardUint8 comparator, forwarding the search
// container and the seed list unchanged.
void
NeighborhoodGraph::Search::jaccardUint8(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds)
{
graph.searchReadOnlyGraph<PrimitiveComparator::JaccardUint8>(sc, seeds);
}
#endif

void
Expand Down
2 changes: 2 additions & 0 deletions lib/NGT/Graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ namespace NGT {
case NGT::ObjectSpace::Uint8:
switch (dtype) {
case NGT::ObjectSpace::DistanceTypeHamming : return hammingUint8;
case NGT::ObjectSpace::DistanceTypeJaccard : return jaccardUint8;
case NGT::ObjectSpace::DistanceTypeL2 : return l2Uint8;
case NGT::ObjectSpace::DistanceTypeL1 : return l1Uint8;
default : return l2Uint8;
Expand All @@ -290,6 +291,7 @@ namespace NGT {
static void l1Float(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void l2Float(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void hammingUint8(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void jaccardUint8(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void cosineSimilarityFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void angleFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void normalizedCosineSimilarityFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
Expand Down
3 changes: 3 additions & 0 deletions lib/NGT/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ namespace NGT {
case DistanceType::DistanceTypeL1: p.set("DistanceType", "L1"); break;
case DistanceType::DistanceTypeL2: p.set("DistanceType", "L2"); break;
case DistanceType::DistanceTypeHamming: p.set("DistanceType", "Hamming"); break;
case DistanceType::DistanceTypeJaccard: p.set("DistanceType", "Jaccard"); break;
case DistanceType::DistanceTypeAngle: p.set("DistanceType", "Angle"); break;
case DistanceType::DistanceTypeCosine: p.set("DistanceType", "Cosine"); break;
case DistanceType::DistanceTypeNormalizedAngle: p.set("DistanceType", "NormalizedAngle"); break;
Expand Down Expand Up @@ -169,6 +170,8 @@ namespace NGT {
distanceType = DistanceType::DistanceTypeL2;
} else if (it->second == "Hamming") {
distanceType = DistanceType::DistanceTypeHamming;
} else if (it->second == "Jaccard") {
distanceType = DistanceType::DistanceTypeJaccard;
} else if (it->second == "Angle") {
distanceType = DistanceType::DistanceTypeAngle;
} else if (it->second == "Cosine") {
Expand Down
3 changes: 2 additions & 1 deletion lib/NGT/ObjectSpace.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ namespace NGT {
DistanceTypeAngle = 3,
DistanceTypeCosine = 4,
DistanceTypeNormalizedAngle = 5,
DistanceTypeNormalizedCosine = 6
DistanceTypeNormalizedCosine = 6,
DistanceTypeJaccard = 7
};

enum ObjectType {
Expand Down
24 changes: 24 additions & 0 deletions lib/NGT/ObjectSpaceRepository.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,27 @@ namespace NGT {
#endif
};

class ComparatorJaccardDistance : public Comparator {
public:
#ifdef NGT_SHARED_MEMORY_ALLOCATOR
ComparatorJaccardDistance(size_t d, SharedMemoryAllocator &a) : Comparator(d, a) {}
double operator()(Object &objecta, Object &objectb) {
return PrimitiveComparator::compareJaccardDistance((OBJECT_TYPE*)&objecta[0], (OBJECT_TYPE*)&objectb[0], dimension);
}
double operator()(Object &objecta, PersistentObject &objectb) {
return PrimitiveComparator::compareJaccardDistance((OBJECT_TYPE*)&objecta[0], (OBJECT_TYPE*)&objectb.at(0, allocator), dimension);
}
double operator()(PersistentObject &objecta, PersistentObject &objectb) {
return PrimitiveComparator::compareJaccardDistance((OBJECT_TYPE*)&objecta.at(0, allocator), (OBJECT_TYPE*)&objectb.at(0, allocator), dimension);
}
#else
ComparatorJaccardDistance(size_t d) : Comparator(d) {}
double operator()(Object &objecta, Object &objectb) {
return PrimitiveComparator::compareJaccardDistance((OBJECT_TYPE*)&objecta[0], (OBJECT_TYPE*)&objectb[0], dimension);
}
#endif
};

class ComparatorAngleDistance : public Comparator {
public:
#ifdef NGT_SHARED_MEMORY_ALLOCATOR
Expand Down Expand Up @@ -278,6 +299,9 @@ namespace NGT {
case DistanceTypeHamming:
comparator = new ObjectSpaceRepository::ComparatorHammingDistance(ObjectSpace::getPaddedDimension());
break;
case DistanceTypeJaccard:
comparator = new ObjectSpaceRepository::ComparatorJaccardDistance(ObjectSpace::getPaddedDimension());
break;
case DistanceTypeAngle:
comparator = new ObjectSpaceRepository::ComparatorAngleDistance(ObjectSpace::getPaddedDimension());
break;
Expand Down
103 changes: 72 additions & 31 deletions lib/NGT/PrimitiveComparator.h
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,6 @@ namespace NGT {
}
#endif


#if defined(NGT_COMPARATOR_NO_AVX) || !defined(__POPCNT__)
inline static double popCount(uint32_t x) {
x = (x & 0x55555555) + (x >> 1 & 0x55555555);
Expand All @@ -326,41 +325,76 @@ namespace NGT {

template <typename OBJECT_TYPE>
inline static double compareHammingDistance(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) {
const OBJECT_TYPE *last = a + size;
const uint32_t *last = reinterpret_cast<const uint32_t*>(a + size);

OBJECT_TYPE *uinta = (OBJECT_TYPE*)a;
OBJECT_TYPE *uintb = (OBJECT_TYPE*)b;
const uint32_t *uinta = reinterpret_cast<const uint32_t*>(a);
const uint32_t *uintb = reinterpret_cast<const uint32_t*>(b);
size_t count = 0;
while( uinta < (OBJECT_TYPE*)last ){
count += popCount(*(uint32_t*)uinta ^ *(uint32_t*)uintb);
uinta += 4;
uintb += 4;
while( uinta < last ){
count += popCount(*uinta++ ^ *uintb++);
}

return (double)count;
return static_cast<double>(count);
}
#else
template <typename OBJECT_TYPE>
inline static double compareHammingDistance(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) {
const OBJECT_TYPE *last = a + size;
const uint64_t *last = reinterpret_cast<const uint64_t*>(a + size);

uint64_t *uinta = (uint64_t*)a;
uint64_t *uintb = (uint64_t*)b;
const uint64_t *uinta = reinterpret_cast<const uint64_t*>(a);
const uint64_t *uintb = reinterpret_cast<const uint64_t*>(b);
size_t count = 0;
while( uinta < (uint64_t*)last ){
while( uinta < last ){
count += _mm_popcnt_u64(*uinta++ ^ *uintb++);
count += _mm_popcnt_u64(*uinta++ ^ *uintb++);
}

return (double)count;
return static_cast<double>(count);
}
#endif

#if defined(NGT_COMPARATOR_NO_AVX) || !defined(__POPCNT__)
// Jaccard distance between the bit sets stored in a and b:
//   1 - popcount(a AND b) / popcount(a OR b)
//
// a, b : buffers of `size` elements, scanned as 32-bit words.
// size : element count; size * sizeof(OBJECT_TYPE) must be a multiple of
//        4 bytes, because whole 32-bit words are read.
// Returns a value in [0, 1]; 0.0 when no bit is set in either buffer.
template <typename OBJECT_TYPE>
inline static double compareJaccardDistance(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) {
  const uint32_t *last = reinterpret_cast<const uint32_t*>(a + size);

  const uint32_t *uinta = reinterpret_cast<const uint32_t*>(a);
  const uint32_t *uintb = reinterpret_cast<const uint32_t*>(b);
  size_t count = 0;    // bits set in both operands (intersection)
  size_t countDe = 0;  // bits set in either operand (union, the denominator)
  // One word per bound check, like the 32-bit Hamming loop, so a buffer whose
  // length is a multiple of 4 (but not 8) bytes is never read past its end.
  while (uinta < last) {
    count += popCount(*uinta & *uintb);
    countDe += popCount(*uinta | *uintb);
    ++uinta;
    ++uintb;
  }

  // Two empty sets are identical; return 0 instead of the NaN that 0/0 yields.
  if (countDe == 0) {
    return 0.0;
  }
  return 1.0 - static_cast<double>(count) / static_cast<double>(countDe);
}
#else
// Jaccard distance between the bit sets stored in a and b:
//   1 - popcount(a AND b) / popcount(a OR b)
//
// a, b : buffers of `size` elements, scanned as 64-bit words.
// size : element count; size * sizeof(OBJECT_TYPE) must be a multiple of
//        16 bytes, because the loop consumes two 64-bit words per bound
//        check. NOTE(review): the visible caller passes the padded
//        dimension; confirm the padding guarantees this.
// Returns a value in [0, 1]; 0.0 when no bit is set in either buffer.
template <typename OBJECT_TYPE>
inline static double compareJaccardDistance(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) {
  const uint64_t *last = reinterpret_cast<const uint64_t*>(a + size);

  const uint64_t *uinta = reinterpret_cast<const uint64_t*>(a);
  const uint64_t *uintb = reinterpret_cast<const uint64_t*>(b);
  size_t count = 0;    // bits set in both operands (intersection)
  size_t countDe = 0;  // bits set in either operand (union, the denominator)
  // Unrolled by two, matching the 64-bit Hamming loop.
  while (uinta < last) {
    count += _mm_popcnt_u64(*uinta & *uintb);
    countDe += _mm_popcnt_u64(*uinta++ | *uintb++);
    count += _mm_popcnt_u64(*uinta & *uintb);
    countDe += _mm_popcnt_u64(*uinta++ | *uintb++);
  }

  // Two empty sets are identical; return 0 instead of the NaN that 0/0 yields.
  if (countDe == 0) {
    return 0.0;
  }
  return 1.0 - static_cast<double>(count) / static_cast<double>(countDe);
}
#endif


#if defined(NGT_COMPARATOR_NO_AVX)
template <typename OBJECT_TYPE>
inline static double compareDotProduct(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) {
double sum = 0.0F;
double sum = 0.0;
for (size_t loc = 0; loc < size; loc++) {
sum += (double)a[loc] * (double)b[loc];
}
Expand All @@ -369,9 +403,9 @@ namespace NGT {

template <typename OBJECT_TYPE>
inline static double compareCosine(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) {
double normA = 0.0F;
double normB = 0.0F;
double sum = 0.0F;
double normA = 0.0;
double normB = 0.0;
double sum = 0.0;
for (size_t loc = 0; loc < size; loc++) {
normA += (double)a[loc] * (double)a[loc];
normB += (double)b[loc] * (double)b[loc];
Expand Down Expand Up @@ -432,7 +466,7 @@ namespace NGT {
}

inline static double compareDotProduct(const unsigned char *a, const unsigned char *b, size_t size) {
double sum = 0.0F;
double sum = 0.0;
for (size_t loc = 0; loc < size; loc++) {
sum += (double)a[loc] * (double)b[loc];
}
Expand Down Expand Up @@ -479,9 +513,9 @@ namespace NGT {
}

inline static double compareCosine(const unsigned char *a, const unsigned char *b, size_t size) {
double normA = 0.0F;
double normB = 0.0F;
double sum = 0.0F;
double normA = 0.0;
double normB = 0.0;
double sum = 0.0;
for (size_t loc = 0; loc < size; loc++) {
normA += (double)a[loc] * (double)a[loc];
normB += (double)b[loc] * (double)b[loc];
Expand All @@ -497,10 +531,10 @@ namespace NGT {
template <typename OBJECT_TYPE>
inline static double compareAngleDistance(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) {
double cosine = compareCosine(a, b, size);
if (cosine >= 1.0F) {
return 0.0F;
} else if (cosine <= -1.0F) {
return acos(-1.0F);
if (cosine >= 1.0) {
return 0.0;
} else if (cosine <= -1.0) {
return acos(-1.0);
} else {
return acos(cosine);
}
Expand All @@ -509,10 +543,10 @@ namespace NGT {
template <typename OBJECT_TYPE>
inline static double compareNormalizedAngleDistance(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) {
double cosine = compareDotProduct(a, b, size);
if (cosine >= 1.0F) {
return 0.0F;
} else if (cosine <= -1.0F) {
return acos(-1.0F);
if (cosine >= 1.0) {
return 0.0;
} else if (cosine <= -1.0) {
return acos(-1.0);
} else {
return acos(cosine);
}
Expand Down Expand Up @@ -550,6 +584,13 @@ namespace NGT {
}
};

// Comparator policy for Jaccard distance: views both untyped buffers as
// uint8_t arrays and forwards them, with the element count, to
// PrimitiveComparator::compareJaccardDistance.
class JaccardUint8 {
public:
  inline static double compare(const void *a, const void *b, size_t size) {
    const uint8_t *lhs = static_cast<const uint8_t*>(a);
    const uint8_t *rhs = static_cast<const uint8_t*>(b);
    return PrimitiveComparator::compareJaccardDistance(lhs, rhs, size);
  }
};

class L2Float {
public:
inline static double compare(const void *a, const void *b, size_t size) {
Expand Down
1 change: 1 addition & 0 deletions python/README-ngtpy-jp.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ FUNCTIONS
- __Cosine__: コサイン類似度
- __Normalized Cosine__: 正規化コサイン類似度。指定されたデータは自動的に正規化された上でインデックスに登録されます。
- __Hamming__: ハミング距離
- __Jaccard__: ジャッカード距離

**object\_type**
オブジェクトのデータタイプを指定します。
Expand Down
1 change: 1 addition & 0 deletions python/README-ngtpy.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ Specifies the distance function for the objects.
- __Cosine__: Cosine similarity
- __Normalized Cosine__: Normalized cosine similarity. The specified data are automatically normalized before being appended to the index.
- __Hamming__: Hamming distance
- __Jaccard__: Jaccard distance

**object\_type**
Specifies the data type of the objects.
Expand Down
2 changes: 1 addition & 1 deletion python/ngt/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ create an empty index with the specified parameters.
edge_size_for_creation : Number of edges for each node in the graph.
edge_size_for_search : Number of edges to search.
object_type : Type of the data object. (Float, Integer [Integer is 1 byte])
distance_type : Type of the distance function. (L1,L2,Angle,Hamming)
distance_type : Type of the distance function. (L1,L2,Angle,Hamming,Jaccard)

<h2 id="ngt.base.Index.save">save</h2>

Expand Down
6 changes: 6 additions & 0 deletions python/ngt/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ def __repr__(self):

__ngt.ngt_set_property_distance_type_hamming.argtypes = [c_void_p, c_void_p]

__ngt.ngt_set_property_distance_type_jaccard.argtypes = [c_void_p, c_void_p]

__ngt.ngt_set_property_distance_type_cosine.argtypes = [c_void_p, c_void_p]

__ngt.ngt_create_empty_results.argtype = [c_void_p]
Expand Down Expand Up @@ -249,6 +251,10 @@ def create(path, dimension,
stat = Index.__ngt.ngt_set_property_distance_type_hamming(
prop, err)
Index._check_error_num(stat, err)
elif distance_type == "Jaccard":
stat = Index.__ngt.ngt_set_property_distance_type_jaccard(
prop, err)
Index._check_error_num(stat, err)
elif distance_type == "Cosine":
stat = Index.__ngt.ngt_set_property_distance_type_cosine(
prop, err)
Expand Down
Loading

0 comments on commit 6cd35d6

Please sign in to comment.