This repository has been archived by the owner on Aug 16, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathindex.go
122 lines (103 loc) · 3.31 KB
/
index.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
package gann
import (
"sync"
"github.com/google/uuid"
"github.com/mathetake/gann/metric"
)
// Index is the interface of gann's search index. GetANNbyItemID and GetANNbyVector are different in the form of query.
// GetANNbyItemID can be executed by passing a certain item's id contained in the list of items used in the index building phase.
// GetANNbyVector allows us to pass any vector of proper dimension.
//
// searchNum is the number of requested approximated nearest neighbors, and bucketScale can be tuned to make balance between
// the search result's accuracy and computational complexity in the search phase.
//
// see README.md for more details.
type Index interface {
// GetANNbyItemID ... search approximate nearest neighbors by a given itemID
GetANNbyItemID(id int64, searchNum int, bucketScale float64) (ann []int64, err error)
// GetANNbyVector ... search approximate nearest neighbors by a given query vector
GetANNbyVector(v []float64, searchNum int, bucketScale float64) (ann []int64, err error)
}
type index struct {
metric metric.Metric
// dim ... dimension of the target space
dim int
// k ... maximum # of items in a single leaf node
k int
// itemIDToItem ... ItemIDToItem
itemIDToItem map[itemId]*item
// nodeIDToNode ... NodeIDToNode
nodeIDToNode map[nodeId]*node
// roots ... roots of the trees
roots []*node
mux *sync.Mutex
}
// CreateNewIndex build a new search index for given vectors. rawItems should consist of search target vectors and
// its slice index corresponds to the first argument id of GetANNbyItemID. For example, if we want to search approximate
// nearest neighbors of rawItems[3], it can simply achieved by calling index.GetANNbyItemID(3, ...).
//
// dim is the dimension of target spaces. nTree and k are tunable parameters which affects performances of
// the index (see README.md for details.)
//
// The last argument m is type of metric.Metric and represents the metric of the target search space.
// See https://godoc.org/github.com/mathetake/gann/metric for details.
func CreateNewIndex(rawItems [][]float64, dim, nTree, k int, m metric.Metric) (Index, error) {
// verify that given items have same dimension
for _, it := range rawItems {
if len(it) != dim {
return nil, errDimensionMismatch
}
}
if len(rawItems) < 2 {
return nil, errNotEnoughItems
}
its := make([]*item, len(rawItems))
idToItem := make(map[itemId]*item, len(rawItems))
for i, v := range rawItems {
it := &item{
id: itemId(i),
vector: v,
}
its[i] = it
idToItem[it.id] = it
}
idx := &index{
metric: m,
dim: dim,
k: k,
itemIDToItem: idToItem,
roots: make([]*node, nTree),
nodeIDToNode: map[nodeId]*node{},
mux: &sync.Mutex{},
}
// build
idx.build(its, nTree)
return idx, nil
}
func (idx *index) build(items []*item, nTree int) {
vs := make([][]float64, len(idx.itemIDToItem))
for i, it := range items {
vs[i] = it.vector
}
for i := 0; i < nTree; i++ {
nv := idx.metric.GetSplittingVector(vs)
rn := &node{
id: nodeId(uuid.New().String()),
vec: nv,
idxPtr: idx,
children: map[direction]*node{},
}
idx.roots[i] = rn
idx.nodeIDToNode[rn.id] = rn
}
var wg sync.WaitGroup
wg.Add(nTree)
for _, rn := range idx.roots {
rn := rn
go func() {
defer wg.Done()
rn.build(items)
}()
}
wg.Wait()
}