From c970e136799e80d6a1279132d73414f3dadcca98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Gill=C3=A9?= Date: Sun, 3 Mar 2024 21:47:23 +0100 Subject: [PATCH 1/5] Export Document struct and constructor --- collection.go | 8 ++++---- db.go | 4 ++-- document.go | 54 ++++++++++++++++++++++++++++++++----------------- example/main.go | 4 ++-- query.go | 26 ++++++++++++------------ 5 files changed, 57 insertions(+), 39 deletions(-) diff --git a/collection.go b/collection.go index 4e45f2a..a7d2af4 100644 --- a/collection.go +++ b/collection.go @@ -19,7 +19,7 @@ type Collection struct { persistDirectory string metadata map[string]string - documents map[string]*document + documents map[string]*Document documentsLock sync.RWMutex embed EmbeddingFunc } @@ -38,7 +38,7 @@ func newCollection(name string, metadata map[string]string, embed EmbeddingFunc, Name: name, metadata: m, - documents: make(map[string]*document), + documents: make(map[string]*Document), embed: embed, } @@ -221,14 +221,14 @@ func (c *Collection) add(ctx context.Context, ids []string, documents []string, } func (c *Collection) addRow(ctx context.Context, id string, document string, embedding []float32, metadata map[string]string) error { - doc, err := newDocument(ctx, id, embedding, metadata, document, c.embed) + doc, err := NewDocument(ctx, id, metadata, embedding, document, c.embed) if err != nil { return fmt.Errorf("couldn't create document '%s': %w", id, err) } c.documentsLock.Lock() // We don't defer the unlock because we want to do it earlier. - c.documents[id] = doc + c.documents[id] = &doc c.documentsLock.Unlock() // Persist the document diff --git a/db.go b/db.go index 0f9ac92..4662b9b 100644 --- a/db.go +++ b/db.go @@ -91,7 +91,7 @@ func NewPersistentDB(path string) (*DB, error) { c := &Collection{ // We can fill Name, persistDirectory and metadata only after reading // the metadata. - documents: make(map[string]*document), + documents: make(map[string]*Document), // We can fill embed only when the user calls DB.GetCollection() or // DB.GetOrCreateCollection(). } @@ -119,7 +119,7 @@ func NewPersistentDB(path string) (*DB, error) { c.metadata = pc.Metadata } else if filepath.Ext(collectionDirEntry.Name()) == ".gob" { // Read document - d := &document{} + d := &Document{} err := read(fPath, d) if err != nil { return nil, fmt.Errorf("couldn't read document: %w", err) diff --git a/document.go b/document.go index 49b8a38..857d5c4 100644 --- a/document.go +++ b/document.go @@ -2,26 +2,45 @@ package chromem import ( "context" + "errors" ) -type document struct { - ID string - Metadata map[string]string - Document string - - Vectors []float32 +// Document represents a single document. +type Document struct { + ID string + Metadata map[string]string + Embedding []float32 + Content string } -// newDocument creates a new document, including its embeddings. +// NewDocument creates a new document, including its embeddings. +// Metadata is optional. // If the embeddings are not provided, they are created using the embedding function. -func newDocument(ctx context.Context, id string, embeddings []float32, metadata map[string]string, doc string, embed EmbeddingFunc) (*document, error) { - if len(embeddings) == 0 { - vectors, err := embed(ctx, doc) +// You can leave the content empty if you only want to store embeddings. +// If embeddingFunc is nil, the default embedding function is used. +// +// If you want to create a document without embeddings, for example to let [Collection.AddDocuments] +// create them concurrently, you can create a document with `chromem.Document{...}` +// instead of using this constructor. +func NewDocument(ctx context.Context, id string, metadata map[string]string, embedding []float32, content string, embeddingFunc EmbeddingFunc) (Document, error) { + if id == "" { + return Document{}, errors.New("id is empty") + } + if len(embedding) == 0 && content == "" { + return Document{}, errors.New("either embedding or content must be filled") + } + if embeddingFunc == nil { + embeddingFunc = NewEmbeddingFuncDefault() + } + + if len(embedding) == 0 { + var err error + embedding, err = embeddingFunc(ctx, content) if err != nil { - return nil, err + return Document{}, err } - embeddings = vectors } + // We copy the metadata to avoid data races in case the caller modifies the // map after creating the document while we range over it. m := make(map[string]string, len(metadata)) @@ -29,11 +48,10 @@ func newDocument(ctx context.Context, id string, embeddings []float32, metadata m[k] = v } - return &document{ - ID: id, - Metadata: metadata, - Document: doc, - - Vectors: embeddings, + return Document{ + ID: id, + Metadata: metadata, + Embedding: embedding, + Content: content, }, nil } diff --git a/example/main.go b/example/main.go index 0e5e632..6068ee7 100644 --- a/example/main.go +++ b/example/main.go @@ -105,12 +105,12 @@ func main() { // Print the retrieved documents and their similarity to the question. for i, res := range docRes { - log.Printf("Document %d (similarity: %f): \"%s\"\n", i+1, res.Similarity, res.Document) + log.Printf("Document %d (similarity: %f): \"%s\"\n", i+1, res.Similarity, res.Content) } // Now we can ask the LLM again, augmenting the question with the knowledge we retrieved. // In this example we just use both retrieved documents as context. - contexts := []string{docRes[0].Document, docRes[1].Document} + contexts := []string{docRes[0].Content, docRes[1].Content} log.Println("Asking LLM with augmented question...") reply = askLLM(ctx, contexts, question) log.Printf("Reply after augmenting the question with knowledge: \"" + reply + "\"\n") diff --git a/query.go b/query.go index 317a729..ab6b832 100644 --- a/query.go +++ b/query.go @@ -12,9 +12,9 @@ var supportedFilters = []string{"$contains", "$not_contains"} // Result represents a single result from a query. type Result struct { ID string - Embedding []float32 Metadata map[string]string - Document string + Embedding []float32 + Content string // The cosine similarity between the query and the document. // The higher the value, the more similar the document is to the query. @@ -24,8 +24,8 @@ type Result struct { // filterDocs filters a map of documents by metadata and content. // It does this concurrently. -func filterDocs(docs map[string]*document, where, whereDocument map[string]string) []*document { - filteredDocs := make([]*document, 0, len(docs)) +func filterDocs(docs map[string]*Document, where, whereDocument map[string]string) []*Document { + filteredDocs := make([]*Document, 0, len(docs)) filteredDocsLock := sync.Mutex{} // Determine concurrency. Use number of docs or CPUs, whichever is smaller. @@ -36,7 +36,7 @@ func filterDocs(docs map[string]*document, where, whereDocument map[string]strin concurrency = numDocs } - docChan := make(chan *document, concurrency*2) + docChan := make(chan *Document, concurrency*2) wg := sync.WaitGroup{} for i := 0; i < concurrency; i++ { @@ -65,7 +65,7 @@ func filterDocs(docs map[string]*document, where, whereDocument map[string]strin // documentMatchesFilters checks if a document matches the given filters. // When calling this function, the whereDocument keys must already be validated! -func documentMatchesFilters(document *document, where, whereDocument map[string]string) bool { +func documentMatchesFilters(document *Document, where, whereDocument map[string]string) bool { // A document's metadata must have *all* the fields in the where clause. for k, v := range where { // TODO: Do we want to check for existence of the key? I.e. should @@ -80,11 +80,11 @@ func documentMatchesFilters(document *document, where, whereDocument map[string] for k, v := range whereDocument { switch k { case "$contains": - if !strings.Contains(document.Document, v) { + if !strings.Contains(document.Content, v) { return false } case "$not_contains": - if strings.Contains(document.Document, v) { + if strings.Contains(document.Content, v) { return false } default: @@ -97,7 +97,7 @@ func documentMatchesFilters(document *document, where, whereDocument map[string] return true } -func calcDocSimilarity(ctx context.Context, queryVectors []float32, docs []*document) ([]Result, error) { +func calcDocSimilarity(ctx context.Context, queryVectors []float32, docs []*Document) ([]Result, error) { res := make([]Result, len(docs)) resLock := sync.Mutex{} @@ -112,7 +112,7 @@ func calcDocSimilarity(ctx context.Context, queryVectors []float32, docs []*docu ctx, cancel := context.WithCancelCause(ctx) defer cancel(nil) - docChan := make(chan *document, concurrency*2) + docChan := make(chan *Document, concurrency*2) var globalErr error globalErrLock := sync.Mutex{} @@ -127,7 +127,7 @@ func calcDocSimilarity(ctx context.Context, queryVectors []float32, docs []*docu return } - sim, err := cosineSimilarity(queryVectors, doc.Vectors) + sim, err := cosineSimilarity(queryVectors, doc.Embedding) if err != nil { globalErrLock.Lock() defer globalErrLock.Unlock() @@ -144,9 +144,9 @@ func calcDocSimilarity(ctx context.Context, queryVectors []float32, docs []*docu // We don't defer the unlock because we want to unlock much earlier. res = append(res, Result{ ID: doc.ID, - Embedding: doc.Vectors, Metadata: doc.Metadata, - Document: doc.Document, + Embedding: doc.Embedding, + Content: doc.Content, Similarity: sim, }) From 236b6a9d05dd85ea6fa288ef1e1b6cb9bbc7a782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Gill=C3=A9?= Date: Sun, 3 Mar 2024 21:47:40 +0100 Subject: [PATCH 2/5] Add/export Collection.AddDocuments and AddDocument Taking Document objects instead of individual ids/metadata/embeddings/content parameters. Move doc metadata copying from constructor to addition to collection, as we don't use the map before it's added to the collection --- collection.go | 236 +++++++++++++++++++++++++++++++------------------- document.go | 7 -- 2 files changed, 145 insertions(+), 98 deletions(-) diff --git a/collection.go b/collection.go index a7d2af4..0a1fd64 100644 --- a/collection.go +++ b/collection.go @@ -78,19 +78,161 @@ func newCollection(name string, metadata map[string]string, embed EmbeddingFunc, // you can filter on this metadata. Optional. // - documents: The documents to associate with the embeddings. // -// A row-based API will be added when Chroma adds it (they already plan to). +// This is a Chroma-like method. For a more Go-idiomatic one, see [AddDocuments]. func (c *Collection) Add(ctx context.Context, ids []string, embeddings [][]float32, metadatas []map[string]string, documents []string) error { - return c.add(ctx, ids, documents, embeddings, metadatas, 1) + return c.AddConcurrently(ctx, ids, embeddings, metadatas, documents, 1) } // AddConcurrently is like Add, but adds embeddings concurrently. // This is mostly useful when you don't pass any embeddings so they have to be created. // Upon error, concurrently running operations are canceled and the error is returned. +// +// This is a Chroma-like method. For a more Go-idiomatic one, see [AddDocuments]. func (c *Collection) AddConcurrently(ctx context.Context, ids []string, embeddings [][]float32, metadatas []map[string]string, documents []string, concurrency int) error { + if len(ids) == 0 { + return errors.New("ids are empty") + } + if len(embeddings) == 0 && len(documents) == 0 { + return errors.New("either embeddings or documents must be filled") + } + if len(embeddings) != 0 { + if len(embeddings) != len(ids) { + return errors.New("ids and embeddings must have the same length") + } + } else { + // Assign empty slice so we can simply access via index later + embeddings = make([][]float32, len(ids)) + } + if len(metadatas) != 0 && len(ids) != len(metadatas) { + return errors.New("ids, metadatas and documents must have the same length") + } + if len(documents) != 0 { + if len(documents) != len(ids) { + return errors.New("ids and documents must have the same length") + } + } else { + // Assign empty slice so we can simply access via index later + documents = make([]string, len(ids)) + } if concurrency < 1 { return errors.New("concurrency must be at least 1") } - return c.add(ctx, ids, documents, embeddings, metadatas, concurrency) + + // Convert Chroma-style parameters into a slice of documents. + docs := make([]Document, 0, len(ids)) + for i, id := range ids { + docs = append(docs, Document{ + ID: id, + Metadata: metadatas[i], + Embedding: embeddings[i], + Content: documents[i], + }) + } + + return c.AddDocuments(ctx, docs, concurrency) +} + +// AddDocuments adds documents to the collection with the specified concurrency. +// If the documents don't have embeddings, they will be created using the collection's +// embedding function. +// Upon error, concurrently running operations are canceled and the error is returned. +func (c *Collection) AddDocuments(ctx context.Context, documents []Document, concurrency int) error { + if len(documents) == 0 { + // TODO: Should this be a no-op instead? + return errors.New("documents slice is nil or empty") + } + if concurrency < 1 { + return errors.New("concurrency must be at least 1") + } + // For other validations we rely on AddDocument. + + var globalErr error + globalErrLock := sync.Mutex{} + ctx, cancel := context.WithCancelCause(ctx) + defer cancel(nil) + setGlobalErr := func(err error) { + globalErrLock.Lock() + defer globalErrLock.Unlock() + // Another goroutine might have already set the error. + if globalErr == nil { + globalErr = err + // Cancel the operation for all other goroutines. + cancel(globalErr) + } + } + + var wg sync.WaitGroup + semaphore := make(chan struct{}, concurrency) + for _, doc := range documents { + wg.Add(1) + go func(doc Document) { + defer wg.Done() + + // Don't even start if another goroutine already failed. + if ctx.Err() != nil { + return + } + + // Wait here while $concurrency other goroutines are creating documents. + semaphore <- struct{}{} + defer func() { <-semaphore }() + + err := c.AddDocument(ctx, doc) + if err != nil { + setGlobalErr(fmt.Errorf("couldn't add document '%s': %w", doc.ID, err)) + return + } + }(doc) + } + + wg.Wait() + + return globalErr +} + +// AddDocument adds a document to the collection. +// If the document doesn't have an embedding, it will be created using the collection's +// embedding function. +func (c *Collection) AddDocument(ctx context.Context, doc Document) error { + if doc.ID == "" { + return errors.New("document ID is empty") + } + if len(doc.Embedding) == 0 && doc.Content == "" { + return errors.New("either document embedding or content must be filled") + } + + // We copy the metadata to avoid data races in case the caller modifies the + // map after creating the document while we range over it. + m := make(map[string]string, len(doc.Metadata)) + for k, v := range doc.Metadata { + m[k] = v + } + + // Create embedding if they don't exist + if len(doc.Embedding) == 0 { + embedding, err := c.embed(ctx, doc.Content) + if err != nil { + return fmt.Errorf("couldn't create embedding of document: %w", err) + } + doc.Embedding = embedding + } + + c.documentsLock.Lock() + // We don't defer the unlock because we want to do it earlier. + c.documents[doc.ID] = &doc + c.documentsLock.Unlock() + + // Persist the document + if c.persistDirectory != "" { + safeID := hash2hex(doc.ID) + filePath := path.Join(c.persistDirectory, safeID) + err := persist(filePath, doc) + if err != nil { + return fmt.Errorf("couldn't persist document: %w", err) + } + } + + return nil } // Count returns the number of documents in the collection. @@ -155,91 +297,3 @@ func (c *Collection) Query(ctx context.Context, queryText string, nResults int, // Return the top nResults return res[:nResults], nil } - -func (c *Collection) add(ctx context.Context, ids []string, documents []string, embeddings [][]float32, metadatas []map[string]string, concurrency int) error { - if len(ids) == 0 || len(documents) == 0 { - return errors.New("ids and documents must not be empty") - } - if len(ids) != len(documents) { - return errors.New("ids and documents must have the same length") - } - if len(embeddings) != 0 && len(ids) != len(embeddings) { - return errors.New("ids, embeddings and documents must have the same length") - } - if len(metadatas) != 0 && len(ids) != len(metadatas) { - return errors.New("ids, metadatas and documents must have the same length") - } - - ctx, cancel := context.WithCancelCause(ctx) - defer cancel(nil) - - var wg sync.WaitGroup - var globalErr error - var globalErrLock sync.Mutex - semaphore := make(chan struct{}, concurrency) - for i, document := range documents { - var embedding []float32 - var metadata map[string]string - if len(embeddings) != 0 { - embedding = embeddings[i] - } - if len(metadatas) != 0 { - metadata = metadatas[i] - } - - wg.Add(1) - go func(id string, embedding []float32, metadata map[string]string, document string) { - defer wg.Done() - - // Don't even start if we already have an error - if ctx.Err() != nil { - return - } - - // Wait here while $concurrency other goroutines are creating documents. - semaphore <- struct{}{} - defer func() { <-semaphore }() - - err := c.addRow(ctx, id, document, embedding, metadata) - if err != nil { - globalErrLock.Lock() - defer globalErrLock.Unlock() - // Another goroutine might have already set the error. - if globalErr == nil { - globalErr = err - // Cancel the operation for all other goroutines. - cancel(globalErr) - } - return - } - }(ids[i], embedding, metadata, document) - } - - wg.Wait() - - return globalErr -} - -func (c *Collection) addRow(ctx context.Context, id string, document string, embedding []float32, metadata map[string]string) error { - doc, err := NewDocument(ctx, id, metadata, embedding, document, c.embed) - if err != nil { - return fmt.Errorf("couldn't create document '%s': %w", id, err) - } - - c.documentsLock.Lock() - // We don't defer the unlock because we want to do it earlier. - c.documents[id] = &doc - c.documentsLock.Unlock() - - // Persist the document - if c.persistDirectory != "" { - safeID := hash2hex(id) - filePath := path.Join(c.persistDirectory, safeID) - err := persist(filePath, doc) - if err != nil { - return fmt.Errorf("couldn't persist document: %w", err) - } - } - - return nil -} diff --git a/document.go b/document.go index 857d5c4..4fe1f54 100644 --- a/document.go +++ b/document.go @@ -41,13 +41,6 @@ func NewDocument(ctx context.Context, id string, metadata map[string]string, emb } } - // We copy the metadata to avoid data races in case the caller modifies the - // map after creating the document while we range over it. - m := make(map[string]string, len(metadata)) - for k, v := range metadata { - m[k] = v - } - return Document{ ID: id, Metadata: metadata, From a4fd27914905f5b05a0043f306b12c310957ba0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Gill=C3=A9?= Date: Sun, 3 Mar 2024 20:09:32 +0100 Subject: [PATCH 3/5] Refactor Query() concurrent error handling To match the refactoring in Collection.AddDocuments from the previous commit --- query.go | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/query.go b/query.go index ab6b832..4d15f60 100644 --- a/query.go +++ b/query.go @@ -2,6 +2,7 @@ package chromem import ( "context" + "fmt" "runtime" "strings" "sync" @@ -109,14 +110,23 @@ func calcDocSimilarity(ctx context.Context, queryVectors []float32, docs []*Docu concurrency = numDocs } - ctx, cancel := context.WithCancelCause(ctx) - defer cancel(nil) - - docChan := make(chan *Document, concurrency*2) var globalErr error globalErrLock := sync.Mutex{} + ctx, cancel := context.WithCancelCause(ctx) + defer cancel(nil) + setGlobalErr := func(err error) { + globalErrLock.Lock() + defer globalErrLock.Unlock() + // Another goroutine might have already set the error. + if globalErr == nil { + globalErr = err + // Cancel the operation for all other goroutines. + cancel(globalErr) + } + } wg := sync.WaitGroup{} + docChan := make(chan *Document, concurrency*2) for i := 0; i < concurrency; i++ { wg.Add(1) go func() { @@ -129,14 +139,7 @@ func calcDocSimilarity(ctx context.Context, queryVectors []float32, docs []*Docu sim, err := cosineSimilarity(queryVectors, doc.Embedding) if err != nil { - globalErrLock.Lock() - defer globalErrLock.Unlock() - // Another goroutine might have already set the error. - if globalErr == nil { - globalErr = err - // Cancel the operation for all other goroutines. - cancel(globalErr) - } + setGlobalErr(fmt.Errorf("couldn't calculate similarity for document '%s': %w", doc.ID, err)) return } From 4b5f88df5909152b404df4ca5b98f3039af48727 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Gill=C3=A9?= Date: Mon, 4 Mar 2024 19:39:43 +0100 Subject: [PATCH 4/5] Use new Go-idiomatic methods in example --- example/main.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/example/main.go b/example/main.go index 6068ee7..35e710d 100644 --- a/example/main.go +++ b/example/main.go @@ -54,6 +54,7 @@ func main() { } // Add docs to the collection, if the collection was just created (and not // loaded from persistent storage). + docs := []chromem.Document{} if collection.Count() == 0 { // Here we use a DBpedia sample, where each line contains the lead section/introduction // to some Wikipedia article and its category. @@ -62,9 +63,6 @@ func main() { panic(err) } d := json.NewDecoder(f) - var ids []string - var metadatas []map[string]string - var texts []string log.Println("Reading JSON lines...") for i := 1; ; i++ { var article struct { @@ -78,12 +76,14 @@ func main() { panic(err) } - ids = append(ids, strconv.Itoa(i)) - metadatas = append(metadatas, map[string]string{"category": article.Category}) - texts = append(texts, article.Text) + docs = append(docs, chromem.Document{ + ID: strconv.Itoa(i), + Metadata: map[string]string{"category": article.Category}, + Content: article.Text, + }) } log.Println("Adding documents to chromem-go, including creating their embeddings via Ollama API...") - err = collection.AddConcurrently(ctx, ids, nil, metadatas, texts, runtime.NumCPU()) + err = collection.AddDocuments(ctx, docs, runtime.NumCPU()) if err != nil { panic(err) } From cb0fe2f8e48c7513a550867d11513963bc9958c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Gill=C3=A9?= Date: Mon, 4 Mar 2024 20:19:23 +0100 Subject: [PATCH 5/5] Rename some "document" to "content" or "text" To differentiate between our now exported Document struct and its contents. --- collection.go | 26 +++++++++++++------------- collection_test.go | 8 ++++---- db.go | 4 ++-- embed_compat.go | 8 ++++---- embed_ollama.go | 8 ++++---- embed_openai.go | 14 +++++++------- embed_openai_test.go | 6 +++--- 7 files changed, 37 insertions(+), 37 deletions(-) diff --git a/collection.go b/collection.go index 0a1fd64..d3395cd 100644 --- a/collection.go +++ b/collection.go @@ -73,14 +73,14 @@ func newCollection(name string, metadata map[string]string, embed EmbeddingFunc, // // - ids: The ids of the embeddings you wish to add // - embeddings: The embeddings to add. If nil, embeddings will be computed based -// on the documents using the embeddingFunc set for the Collection. Optional. +// on the contents using the embeddingFunc set for the Collection. Optional. // - metadatas: The metadata to associate with the embeddings. When querying, // you can filter on this metadata. Optional. -// - documents: The documents to associate with the embeddings. +// - contents: The contents to associate with the embeddings. // // This is a Chroma-like method. For a more Go-idiomatic one, see [AddDocuments]. -func (c *Collection) Add(ctx context.Context, ids []string, embeddings [][]float32, metadatas []map[string]string, documents []string) error { - return c.AddConcurrently(ctx, ids, embeddings, metadatas, documents, 1) +func (c *Collection) Add(ctx context.Context, ids []string, embeddings [][]float32, metadatas []map[string]string, contents []string) error { + return c.AddConcurrently(ctx, ids, embeddings, metadatas, contents, 1) } // AddConcurrently is like Add, but adds embeddings concurrently. @@ -88,12 +88,12 @@ func (c *Collection) Add(ctx context.Context, ids []string, embeddings [][]float // Upon error, concurrently running operations are canceled and the error is returned. // // This is a Chroma-like method. For a more Go-idiomatic one, see [AddDocuments]. -func (c *Collection) AddConcurrently(ctx context.Context, ids []string, embeddings [][]float32, metadatas []map[string]string, documents []string, concurrency int) error { +func (c *Collection) AddConcurrently(ctx context.Context, ids []string, embeddings [][]float32, metadatas []map[string]string, contents []string, concurrency int) error { if len(ids) == 0 { return errors.New("ids are empty") } - if len(embeddings) == 0 && len(documents) == 0 { - return errors.New("either embeddings or documents must be filled") + if len(embeddings) == 0 && len(contents) == 0 { + return errors.New("either embeddings or contents must be filled") } if len(embeddings) != 0 { if len(embeddings) != len(ids) { @@ -104,15 +104,15 @@ func (c *Collection) AddConcurrently(ctx context.Context, ids []string, embeddin embeddings = make([][]float32, len(ids)) } if len(metadatas) != 0 && len(ids) != len(metadatas) { - return errors.New("ids, metadatas and documents must have the same length") + return errors.New("ids, metadatas and contents must have the same length") } - if len(documents) != 0 { - if len(documents) != len(ids) { - return errors.New("ids and documents must have the same length") + if len(contents) != 0 { + if len(contents) != len(ids) { + return errors.New("ids and contents must have the same length") } } else { // Assign empty slice so we can simply access via index later - documents = make([]string, len(ids)) + contents = make([]string, len(ids)) } if concurrency < 1 { return errors.New("concurrency must be at least 1") @@ -125,7 +125,7 @@ func (c *Collection) AddConcurrently(ctx context.Context, ids []string, embeddin ID: id, Metadata: metadatas[i], Embedding: embeddings[i], - Content: documents[i], + Content: contents[i], }) } diff --git a/collection_test.go b/collection_test.go index 472e119..8b1059a 100644 --- a/collection_test.go +++ b/collection_test.go @@ -26,8 +26,8 @@ func TestCollection_Add(t *testing.T) { // Add document ids := []string{"1", "2"} metadatas := []map[string]string{{"foo": "bar"}, {"a": "b"}} - documents := []string{"hello world", "hallo welt"} - err = c.Add(context.Background(), ids, nil, metadatas, documents) + contents := []string{"hello world", "hallo welt"} + err = c.Add(context.Background(), ids, nil, metadatas, contents) if err != nil { t.Error("expected nil, got", err) } @@ -54,8 +54,8 @@ func TestCollection_Count(t *testing.T) { // Add documents ids := []string{"1", "2"} metadatas := []map[string]string{{"foo": "bar"}, {"a": "b"}} - documents := []string{"hello world", "hallo welt"} - err = c.Add(context.Background(), ids, nil, metadatas, documents) + contents := []string{"hello world", "hallo welt"} + err = c.Add(context.Background(), ids, nil, metadatas, contents) if err != nil { t.Error("expected nil, got", err) } diff --git a/db.go b/db.go index 4662b9b..42ab698 100644 --- a/db.go +++ b/db.go @@ -10,10 +10,10 @@ import ( "sync" ) -// EmbeddingFunc is a function that creates embeddings for a given document. +// EmbeddingFunc is a function that creates embeddings for a given text. // chromem-go will use OpenAI`s "text-embedding-3-small" model by default, // but you can provide your own function, using any model you like. -type EmbeddingFunc func(ctx context.Context, document string) ([]float32, error) +type EmbeddingFunc func(ctx context.Context, text string) ([]float32, error) // DB is the chromem-go database. It holds collections, which hold documents. // diff --git a/embed_compat.go b/embed_compat.go index f82e091..a3d18c9 100644 --- a/embed_compat.go +++ b/embed_compat.go @@ -6,7 +6,7 @@ const ( embeddingModelMistral = "mistral-embed" ) -// NewEmbeddingFuncMistral returns a function that creates embeddings for a document +// NewEmbeddingFuncMistral returns a function that creates embeddings for a text // using the Mistral API. func NewEmbeddingFuncMistral(apiKey string) EmbeddingFunc { // The Mistral API docs don't mention the `encoding_format` as optional, @@ -25,7 +25,7 @@ const ( EmbeddingModelJina2BaseZH EmbeddingModelJina = "jina-embeddings-v2-base-zh" ) -// NewEmbeddingFuncJina returns a function that creates embeddings for a document +// NewEmbeddingFuncJina returns a function that creates embeddings for a text // using the Jina API. func NewEmbeddingFuncJina(apiKey string, model EmbeddingModelJina) EmbeddingFunc { return NewEmbeddingFuncOpenAICompat(baseURLJina, apiKey, string(model)) @@ -46,7 +46,7 @@ const ( EmbeddingModelMixedbreadGTELargeZh EmbeddingModelMixedbread = "gte-large-zh" ) -// NewEmbeddingFuncMixedbread returns a function that creates embeddings for a document +// NewEmbeddingFuncMixedbread returns a function that creates embeddings for a text // using the mixedbread.ai API. func NewEmbeddingFuncMixedbread(apiKey string, model EmbeddingModelMixedbread) EmbeddingFunc { return NewEmbeddingFuncOpenAICompat(baseURLMixedbread, apiKey, string(model)) @@ -54,7 +54,7 @@ func NewEmbeddingFuncMixedbread(apiKey string, model EmbeddingModelMixedbread) E const baseURLLocalAI = "http://localhost:8080/v1" -// NewEmbeddingFuncLocalAI returns a function that creates embeddings for a document +// NewEmbeddingFuncLocalAI returns a function that creates embeddings for a text // using the LocalAI API. // You can start a LocalAI instance like this: // diff --git a/embed_ollama.go b/embed_ollama.go index aa14905..2e5f718 100644 --- a/embed_ollama.go +++ b/embed_ollama.go @@ -16,21 +16,21 @@ type ollamaResponse struct { Embedding []float32 `json:"embedding"` } -// NewEmbeddingFuncOllama returns a function that creates embeddings for a document +// NewEmbeddingFuncOllama returns a function that creates embeddings for a text // using Ollama's embedding API. You can pass any model that Ollama supports and // that supports embeddings. A good one as of 2024-03-02 is "nomic-embed-text". // See https://ollama.com/library/nomic-embed-text func NewEmbeddingFuncOllama(model string) EmbeddingFunc { // We don't set a default timeout here, although it's usually a good idea. // In our case though, the library user can set the timeout on the context, - // and it might have to be a long timeout, depending on the document size. + // and it might have to be a long timeout, depending on the text length. client := &http.Client{} - return func(ctx context.Context, document string) ([]float32, error) { + return func(ctx context.Context, text string) ([]float32, error) { // Prepare the request body. reqBody, err := json.Marshal(map[string]string{ "model": model, - "prompt": document, + "prompt": text, }) if err != nil { return nil, fmt.Errorf("couldn't marshal request body: %w", err) diff --git a/embed_openai.go b/embed_openai.go index 681db86..2d301c7 100644 --- a/embed_openai.go +++ b/embed_openai.go @@ -27,22 +27,22 @@ type openAIResponse struct { } `json:"data"` } -// NewEmbeddingFuncDefault returns a function that creates embeddings for a document +// NewEmbeddingFuncDefault returns a function that creates embeddings for a text // using OpenAI`s "text-embedding-3-small" model via their API. -// The model supports a maximum document length of 8191 tokens. +// The model supports a maximum text length of 8191 tokens. // The API key is read from the environment variable "OPENAI_API_KEY". func NewEmbeddingFuncDefault() EmbeddingFunc { apiKey := os.Getenv("OPENAI_API_KEY") return NewEmbeddingFuncOpenAI(apiKey, EmbeddingModelOpenAI3Small) } -// NewEmbeddingFuncOpenAI returns a function that creates embeddings for a document +// NewEmbeddingFuncOpenAI returns a function that creates embeddings for a text // using the OpenAI API. func NewEmbeddingFuncOpenAI(apiKey string, model EmbeddingModelOpenAI) EmbeddingFunc { return NewEmbeddingFuncOpenAICompat(BaseURLOpenAI, apiKey, string(model)) } -// NewEmbeddingFuncOpenAICompat returns a function that creates embeddings for a document +// NewEmbeddingFuncOpenAICompat returns a function that creates embeddings for a text // using an OpenAI compatible API. For example: // - Azure OpenAI: https://azure.microsoft.com/en-us/products/ai-services/openai-service // - LitLLM: https://github.com/BerriAI/litellm @@ -51,13 +51,13 @@ func NewEmbeddingFuncOpenAI(apiKey string, model EmbeddingModelOpenAI) Embedding func NewEmbeddingFuncOpenAICompat(baseURL, apiKey, model string) EmbeddingFunc { // We don't set a default timeout here, although it's usually a good idea. // In our case though, the library user can set the timeout on the context, - // and it might have to be a long timeout, depending on the document size. + // and it might have to be a long timeout, depending on the text length. client := &http.Client{} - return func(ctx context.Context, document string) ([]float32, error) { + return func(ctx context.Context, text string) ([]float32, error) { // Prepare the request body. reqBody, err := json.Marshal(map[string]string{ - "input": document, + "input": text, "model": model, }) if err != nil { diff --git a/embed_openai_test.go b/embed_openai_test.go index af729b5..70c62f9 100644 --- a/embed_openai_test.go +++ b/embed_openai_test.go @@ -24,10 +24,10 @@ func TestNewEmbeddingFuncOpenAICompat(t *testing.T) { apiKey := "secret" model := "model-small" baseURLSuffix := "/v1" - document := "hello world" + input := "hello world" wantBody, err := json.Marshal(map[string]string{ - "input": document, + "input": input, "model": model, }) if err != nil { @@ -76,7 +76,7 @@ func TestNewEmbeddingFuncOpenAICompat(t *testing.T) { baseURL := ts.URL + baseURLSuffix f := chromem.NewEmbeddingFuncOpenAICompat(baseURL, apiKey, model) - res, err := f(context.Background(), document) + res, err := f(context.Background(), input) if err != nil { t.Error("expected nil, got", err) }