From 3385b537ed78697cceeb600e2b4ffdcefb89bc75 Mon Sep 17 00:00:00 2001 From: Elliot Scribner Date: Thu, 5 Jun 2025 12:57:19 -0700 Subject: [PATCH 1/6] Initial CouchbaseQueryVectorStore --- .../src/vectorstores/couchbase_query.ts | 732 ++++++++++++++++++ 1 file changed, 732 insertions(+) create mode 100644 libs/langchain-community/src/vectorstores/couchbase_query.ts diff --git a/libs/langchain-community/src/vectorstores/couchbase_query.ts b/libs/langchain-community/src/vectorstores/couchbase_query.ts new file mode 100644 index 000000000000..aad8e747f8bb --- /dev/null +++ b/libs/langchain-community/src/vectorstores/couchbase_query.ts @@ -0,0 +1,732 @@ +/* eslint-disable no-param-reassign */ +/* eslint-disable @typescript-eslint/no-explicit-any */ +/* eslint-disable import/no-extraneous-dependencies */ +import { EmbeddingsInterface } from "@langchain/core/embeddings"; +import { VectorStore } from "@langchain/core/vectorstores"; +import { + Bucket, + Cluster, + Collection, + Scope, +} from "couchbase"; +import { Document } from "@langchain/core/documents"; +import { v4 as uuid } from "uuid"; + +/** + * Enum for different distance strategies supported by Couchbase vector search + */ +export enum DistanceStrategy { + DOT = "dot", + L2 = "l2", + EUCLIDEAN = "euclidean", + COSINE = "cosine", + L2_SQUARED = "l2_squared", + EUCLIDEAN_SQUARED = "euclidean_squared", +} + +export enum IndexType { + COMPOSITE = "composite", + BHIVE = "bhive", +} + +/** + * Interface for create_index method parameters + */ +export interface CreateIndexOptions { + indexType: IndexType; + indexDescription: string; + distanceMetric?: DistanceStrategy; + indexName?: string; + vectorField?: string; + vectorDimension?: number; + fields?: string[]; + whereClause?: string; + indexScanNprobes?: number; + indexTrainlist?: number; +} + +/** + * This interface define the optional fields for adding vector + * - `ids` - vector of ids for each document. 
If undefined, then uuid will be used + * - `metadata` - vector of metadata object for each document + */ +export interface AddVectorOptions { + ids?: string[]; + metadata?: Record[]; +} + +/** + * This interface defines the fields required to initialize a query vector store + * These are the fields part of config: + * @property {Cluster} cluster - The Couchbase cluster that the store will interact with. + * @property {string} bucketName - The name of the bucket in the Couchbase cluster. + * @property {string} scopeName - The name of the scope within the bucket. + * @property {string} collectionName - The name of the collection within the scope. + * @property {string} textKey - The key to be used for text in the documents. Defaults to "text". + * @property {string} embeddingKey - The key to be used for embeddings in the documents. Defaults to "embedding". + * @property {DistanceStrategy} distanceStrategy - The distance strategy to use for vector similarity calculations. Defaults to DOT. + * @property {AddVectorOptions} addVectorOptions - Options for adding vectors with specific id/metadata + */ +export interface CouchbaseQueryVectorStoreArgs { + cluster: Cluster; + bucketName: string; + scopeName: string; + collectionName: string; + textKey?: string; + embeddingKey?: string; + distanceStrategy?: DistanceStrategy; + addVectorOptions?: AddVectorOptions; +} + +/** + * This type defines the search filters used in couchbase query vector search + * - `where`: Optional WHERE clause conditions for the SQL++ query + * - `fields`: Optional list of fields to include in the results + */ +type CouchbaseQueryVectorStoreFilter = { + where?: string; + fields?: string[]; +}; + +/** + * Class for interacting with the Couchbase database using Query service for vector search. + * It extends the VectorStore class and provides methods for adding vectors and + * documents, and searching for similar vectors using SQL++ queries. + * Initiate the class using initialize() method. 
+ */ +export class CouchbaseQueryVectorStore extends VectorStore { + declare FilterType: CouchbaseQueryVectorStoreFilter; + + private metadataKey = "metadata"; + + private readonly defaultTextKey = "text"; + + private readonly defaultEmbeddingKey = "embedding"; + + private readonly defaultDistanceStrategy = DistanceStrategy.DOT; + + private cluster: Cluster; + + private _bucket: Bucket; + + private _scope: Scope; + + private _collection: Collection; + + private bucketName: string; + + private scopeName: string; + + private collectionName: string; + + private textKey = this.defaultTextKey; + + private embeddingKey = this.defaultEmbeddingKey; + + private distanceStrategy = this.defaultDistanceStrategy; + + /** + * The private constructor used to provide embedding to parent class. + * Initialize the class using static initialize() method + * @param embedding - object to generate embedding + * @param config - the fields required to initialize a vector store + */ + private constructor( + embedding: EmbeddingsInterface, + config: CouchbaseQueryVectorStoreArgs + ) { + super(embedding, config); + } + + _vectorstoreType(): string { + return "couchbase_query"; + } + + /** + * initialize class for interacting with the Couchbase database using Query service. + * It extends the VectorStore class and provides methods + * for adding vectors and documents, and searching for similar vectors. 
+ * This also verifies the params + * + * @param embeddings - object to generate embedding + * @param config - the fields required to initialize a vector store + */ + static async initialize( + embeddings: EmbeddingsInterface, + config: CouchbaseQueryVectorStoreArgs + ) { + const store = new CouchbaseQueryVectorStore(embeddings, config); + + const { + cluster, + bucketName, + scopeName, + collectionName, + textKey, + embeddingKey, + distanceStrategy, + } = config; + + store.cluster = cluster; + store.bucketName = bucketName; + store.scopeName = scopeName; + store.collectionName = collectionName; + if (textKey) { + store.textKey = textKey; + } else { + store.textKey = store.defaultTextKey; + } + + if (embeddingKey) { + store.embeddingKey = embeddingKey; + } else { + store.embeddingKey = store.defaultEmbeddingKey; + } + + if (distanceStrategy) { + store.distanceStrategy = distanceStrategy; + } else { + store.distanceStrategy = store.defaultDistanceStrategy; + } + + try { + store._bucket = store.cluster.bucket(store.bucketName); + store._scope = store._bucket.scope(store.scopeName); + store._collection = store._scope.collection(store.collectionName); + } catch (err) { + throw new Error( + "Error connecting to couchbase, Please check connection and credentials" + ); + } + + try { + if ( + !(await store.checkBucketExists()) || + !(await store.checkScopeAndCollectionExists()) + ) { + throw new Error("Error while initializing vector store"); + } + } catch (err) { + throw new Error(`Error while initializing vector store: ${err}`); + } + return store; + } + + /** + * An asynchronous method to verify the bucket exists. + * It retrieves bucket information and checks if the bucket is present. + * + * @throws - If the specified bucket does not exist in the database. 
+ * + * @returns - returns promise true if no error is found + */ + private async checkBucketExists(): Promise { + try { + await this.cluster.buckets().getBucket(this.bucketName); + return true; + } catch (err) { + throw new Error( + `Bucket with name ${this.bucketName} does not exist. Error: ${err}` + ); + } + } + + /** + * An asynchronous method to verify the scope and collection exist. + * It checks if the specified scope and collection are present. + * + * @throws - If the specified scope or collection does not exist in the database. + * + * @returns - returns promise true if no error is found + */ + private async checkScopeAndCollectionExists(): Promise { + try { + const scopes = await this._bucket.collections().getAllScopes(); + const scope = scopes.find((s: any) => s.name === this.scopeName); + if (!scope) { + throw new Error(`Scope ${this.scopeName} does not exist`); + } + + const collection = scope.collections.find( + (c: any) => c.name === this.collectionName + ); + if (!collection) { + throw new Error(`Collection ${this.collectionName} does not exist`); + } + + return true; + } catch (err) { + throw new Error( + `Scope ${this.scopeName} or Collection ${this.collectionName} does not exist. Error: ${err}` + ); + } + } + + /** + * Method to add vectors and documents to the vector store. + * + * @param vectors - Vectors to be added to the vector store. + * @param documents - Documents to be added to the vector store. + * @param options - Optional parameters for adding vectors. + * + * @returns - Promise that resolves to an array of document IDs. 
+ */ + async addVectors( + vectors: number[][], + documents: Document[], + options?: AddVectorOptions + ): Promise { + if (vectors.length === 0) { + return []; + } + + if (vectors.length !== documents.length) { + throw new Error("Vectors and documents must have the same length"); + } + + const documentIds = options?.ids || documents.map(() => uuid()); + const documentsToInsert: { [key: string]: any }[] = []; + + for (let index = 0; index < vectors.length; index += 1) { + const vector = vectors[index]; + const document = documents[index]; + const documentId = documentIds[index]; + + const documentToInsert = { + [documentId]: { + [this.textKey]: document.pageContent, + [this.embeddingKey]: vector, + [this.metadataKey]: document.metadata, + }, + }; + + documentsToInsert.push(documentToInsert); + } + + const docIds = await this.upsertDocuments(documentsToInsert); + return docIds; + } + + /** + * Method to add documents to the vector store. It first converts + * the documents to vectors using the embeddings and then adds them to the vector store. + * + * @param documents - Documents to be added to the vector store. + * @param options - Optional parameters for adding documents. + * + * @returns - Promise that resolves to an array of document IDs. + */ + async addDocuments( + documents: Document[], + options?: AddVectorOptions + ): Promise { + const texts = documents.map(({ pageContent }) => pageContent); + const vectors = await this.embeddings.embedDocuments(texts); + return this.addVectors(vectors, documents, options); + } + + /** + * Method to delete documents from the vector store. + * + * @param ids - Array of document IDs to be deleted. + * + * @returns - Promise that resolves when the deletion is complete. 
+ */ + async delete(options: { ids: string[] }): Promise { + const { ids } = options; + const deletePromises = ids.map((id) => + this._collection.remove(id).catch((e: any) => { + console.error("error received while deleting document", e); + throw new Error(`Delete failed with error: ${e}`); + }) + ); + + try { + await Promise.all(deletePromises); + } catch (e) { + console.error( + "An error occurred with Promise.all at deleting all documents", + e + ); + throw e; + } + } + + /** + * Return documents that are most similar to the vector embedding using SQL++ query. + * + * @param queryEmbeddings - Embedding vector to look up documents similar to. + * @param k - Number of documents to return. Defaults to 4. + * @param filter - Optional search filter that are passed to Couchbase query. Defaults to empty object. + * - `where`: Optional WHERE clause conditions for the SQL++ query + * - `fields`: Optional list of fields to include in the results + * + * @returns - Promise of list of [document, score] that are the most similar to the query vector. + * + * @throws If the search operation fails. 
+ */ + async similaritySearchVectorWithScore( + queryEmbeddings: number[], + k = 4, + filter: CouchbaseQueryVectorStoreFilter = {} + ): Promise<[Document, number][]> { + const { where, fields } = filter; + + // Build the SELECT clause + let selectClause = `META().id, ${this.textKey}, ${this.metadataKey}`; + if (fields && fields.length > 0) { + selectClause = fields.join(", "); + if (!fields.includes(this.textKey)) { + selectClause += `, ${this.textKey}`; + } + if (!fields.includes(this.metadataKey)) { + selectClause += `, ${this.metadataKey}`; + } + if (!fields.includes("META().id")) { + selectClause += `, META().id`; + } + } + + // Build the WHERE clause + let whereClause = ""; + if (where) { + whereClause = `AND ${where}`; + } + + // Build the SQL++ query with vector search using APPROX_VECTOR_DISTANCE function + // Using the configured distance metric for similarity scoring + + const distanceMetric = this.distanceStrategy; + const query = ` + SELECT ${selectClause}, + APPROX_VECTOR_DISTANCE(${this.embeddingKey}, [${queryEmbeddings}], "${distanceMetric}") as distance + FROM \`${this.bucketName}\`.\`${this.scopeName}\`.\`${this.collectionName}\` + WHERE ${this.embeddingKey} IS NOT NULL ${whereClause} + ORDER BY APPROX_VECTOR_DISTANCE(${this.embeddingKey}, [${queryEmbeddings}], "${distanceMetric}") + LIMIT ${k} + `; + + const docsWithScore: [Document, number][] = []; + try { + const result = await this.cluster.query(query, { + parameters: { + queryVector: queryEmbeddings, + k, + }, + }); + + for (const row of result.rows) { + const text = row[this.textKey]; + const metadata = row[this.metadataKey] || {}; + // Convert distance to similarity score (lower distance = higher similarity) + const distance = row.distance || 0; + const doc = new Document({ + pageContent: text, + metadata, + }); + docsWithScore.push([doc, distance]); + } + } catch (err) { + console.log("error received"); + throw new Error(`Query failed with error: ${err}`); + } + return docsWithScore; + } + 
+ /** + * Return documents that are most similar to the vector embedding. + * + * @param queryEmbeddings - Embedding to look up documents similar to. + * @param k - The number of similar documents to return. Defaults to 4. + * @param filter - Optional search filter that are passed to Couchbase query. Defaults to empty object. + * - `where`: Optional WHERE clause conditions for the SQL++ query + * - `fields`: Optional list of fields to include in the results + * + * @returns - A promise that resolves to an array of documents that match the similarity search. + */ + async similaritySearchByVector( + queryEmbeddings: number[], + k = 4, + filter: CouchbaseQueryVectorStoreFilter = {} + ): Promise { + const docsWithScore = await this.similaritySearchVectorWithScore( + queryEmbeddings, + k, + filter + ); + const docs = []; + for (const doc of docsWithScore) { + docs.push(doc[0]); + } + return docs; + } + + /** + * Return documents that are most similar to the query. + * + * @param query - Query to look up for similar documents + * @param k - The number of similar documents to return. Defaults to 4. + * @param filter - Optional search filter that are passed to Couchbase query. Defaults to empty object. + * - `where`: Optional WHERE clause conditions for the SQL++ query + * - `fields`: Optional list of fields to include in the results + * + * @returns - Promise of list of documents that are most similar to the query. + */ + async similaritySearch( + query: string, + k = 4, + filter: CouchbaseQueryVectorStoreFilter = {} + ): Promise { + const queryEmbeddings = await this.embeddings.embedQuery(query); + const docsWithScore = await this.similaritySearchVectorWithScore( + queryEmbeddings, + k, + filter + ); + const docs = []; + for (const doc of docsWithScore) { + docs.push(doc[0]); + } + return docs; + } + + /** + * Return documents that are most similar to the query with their scores. 
+ * + * @param query - Query to look up for similar documents + * @param k - The number of similar documents to return. Defaults to 4. + * @param filter - Optional search filter that are passed to Couchbase query. Defaults to empty object. + * - `where`: Optional WHERE clause conditions for the SQL++ query + * - `fields`: Optional list of fields to include in the results + * + * @returns - Promise of list of documents that are most similar to the query. + */ + async similaritySearchWithScore( + query: string, + k = 4, + filter: CouchbaseQueryVectorStoreFilter = {} + ): Promise<[Document, number][]> { + const queryEmbeddings = await this.embeddings.embedQuery(query); + const docsWithScore = await this.similaritySearchVectorWithScore( + queryEmbeddings, + k, + filter + ); + return docsWithScore; + } + + /** + * upsert documents asynchronously into a couchbase collection + * @param documentsToInsert Documents to be inserted into couchbase collection with embeddings, original text and metadata + * @returns DocIds of the inserted documents + */ + private async upsertDocuments( + documentsToInsert: { + [x: string]: any; + }[] + ) { + // Create promises for each document to be upserted + const upsertDocumentsPromises = documentsToInsert.map((document) => { + const currentDocumentKey = Object.keys(document)[0]; + return this._collection + .upsert(currentDocumentKey, document[currentDocumentKey]) + .then(() => currentDocumentKey) + .catch((e: any) => { + console.error("error received while upserting document", e); + throw new Error(`Upsert failed with error: ${e}`); + }); + }); + + try { + // Upsert all documents asynchronously + const docIds = await Promise.all(upsertDocumentsPromises); + const successfulDocIds: string[] = []; + for (const id of docIds) { + if (id) { + successfulDocIds.push(id); + } + } + return successfulDocIds; + } catch (e) { + console.error( + "An error occurred with Promise.all at upserting all documents", + e + ); + throw e; + } + } + + /** + * 
Create a new vector index for the Query vector store. + * + * @param options - Configuration options for creating the index + * @param options.indexType - Type of the index (BHIVE or COMPOSITE) to create + * @param options.indexDescription - Description of the index like "IVF,SQ8" + * @param options.distanceMetric - Distance metric to use for the index. Defaults to the distance metric in the constructor + * @param options.indexName - Name of the index to create. Defaults to "langchain_{indexType}_query_index" + * @param options.vectorField - Name of the vector field to use for the index. Defaults to the embedding key in the constructor + * @param options.vectorDimension - Dimension of the vector field. If not provided, it will be determined from the embedding object + * @param options.fields - List of fields to include in the index. Defaults to the text field in the constructor + * @param options.whereClause - Optional where clause to filter the documents to index + * @param options.indexScanNprobes - Number of probes to use for the index + * @param options.indexTrainlist - Number of training samples to use for the index + * + * @throws {Error} If index creation fails or invalid parameters are provided + */ + async createIndex(options: CreateIndexOptions): Promise { + const { + indexType, + indexDescription, + distanceMetric, + indexName, + vectorField, + vectorDimension, + fields, + whereClause, + indexScanNprobes, + indexTrainlist, + } = options; + + if (!Object.values(IndexType).includes(indexType)) { + throw new Error( + `Invalid index type. Got ${indexType}. Expected one of: ${Object.values(IndexType).join(", ")}` + ); + } + + if (!indexDescription) { + throw new Error( + "Index description is required for creating Vector Query index." 
+ ); + } + + const similarityMetric = distanceMetric || this.distanceStrategy; + const vectorFieldName = vectorField || this.embeddingKey; + + // Get the vector dimension for the index + let vectorDim = vectorDimension; + if (!vectorDim) { + try { + const testEmbedding = await this.embeddings.embedQuery( + "check the size of the vector embeddings" + ); + vectorDim = testEmbedding.length; + } catch (e) { + throw new Error( + "Vector dimension is required for creating Query index. " + + "Unable to determine the dimension from the embedding object. " + + `Error: ${e}` + ); + } + } + + // Create the index parameters for the index creation query + const indexParams: Record = { + dimension: vectorDim, + similarity: similarityMetric, + description: indexDescription, + }; + + if (indexScanNprobes) { + indexParams.scan_nprobes = indexScanNprobes; + } + if (indexTrainlist) { + indexParams.trainlist = indexTrainlist; + } + + // Add the text field to the fields if empty or if it is not present + const includeFields = fields || [this.textKey]; + if (!includeFields.includes(this.textKey)) { + includeFields.push(this.textKey); + } + + // Build where clause if provided + const whereClauseStr = whereClause ? 
`WHERE ${whereClause}` : ""; + + // Convert index params to WITH clause format + const withClause = `WITH ${JSON.stringify(indexParams).replace(/"/g, "'")}`; + + let indexQuery: string; + let finalIndexName: string; + + if (indexType === IndexType.BHIVE) { + finalIndexName = indexName || "langchain_bhive_query_index"; + // BHIVE: Specialized vector index with INCLUDE clause for additional fields + indexQuery = `CREATE VECTOR INDEX \`${finalIndexName}\` ON \`${this.bucketName}\`.\`${this.scopeName}\`.\`${this.collectionName}\` ` + + `(\`${vectorFieldName}\` VECTOR) INCLUDE (${includeFields.map(f => `\`${f}\``).join(", ")}) ` + + `${whereClauseStr} USING GSI ${withClause}`; + } else if (indexType === IndexType.COMPOSITE) { + finalIndexName = indexName || "langchain_composite_query_index"; + // COMPOSITE: General GSI index that includes vector field alongside other fields with VECTOR keyword + indexQuery = `CREATE INDEX \`${finalIndexName}\` ON \`${this.bucketName}\`.\`${this.scopeName}\`.\`${this.collectionName}\` ` + + `(${includeFields.map(f => `\`${f}\``).join(", ")}, \`${vectorFieldName}\` VECTOR) ` + + `${whereClauseStr} USING GSI ${withClause}`; + } else { + throw new Error(`Unsupported index type: ${indexType}`); + } + + console.log(indexQuery); + + try { + await this.cluster.query(indexQuery); + console.log(`Successfully created ${indexType} index: ${finalIndexName}`); + } catch (e) { + console.log(e); + if (e && typeof e === 'object' && 'cause' in e && e.cause && typeof e.cause === 'object' && 'first_error_message' in e.cause) { console.log("YO!"); + throw new Error(`Index creation failed with error: ${e.cause.first_error_message}`); + } + throw new Error(`Index creation failed with error: ${e}`); + } + } + + /** + * Static method to create a new CouchbaseQueryVectorStore from an array of texts. + * It first converts the texts to vectors using the embeddings and then creates a new vector store. 
+ * + * @param texts - Array of texts to be converted to vectors. + * @param metadatas - Array of metadata objects corresponding to the texts. + * @param embeddings - Embeddings to be used for converting texts to vectors. + * @param config - Configuration for the vector store. + * + * @returns - Promise that resolves to a new CouchbaseQueryVectorStore instance. + */ + static async fromTexts( + texts: string[], + metadatas: object[] | object, + embeddings: EmbeddingsInterface, + config: CouchbaseQueryVectorStoreArgs + ): Promise { + const docs: Document[] = []; + for (let i = 0; i < texts.length; i += 1) { + const metadata = Array.isArray(metadatas) ? metadatas[i] : metadatas; + const newDoc = new Document({ + pageContent: texts[i], + metadata, + }); + docs.push(newDoc); + } + return CouchbaseQueryVectorStore.fromDocuments(docs, embeddings, config); + } + + /** + * Static method to create a new CouchbaseQueryVectorStore from an array of documents. + * It first converts the documents to vectors using the embeddings and then creates a new vector store. + * + * @param docs - Array of documents to be converted to vectors. + * @param embeddings - Embeddings to be used for converting documents to vectors. + * @param config - Configuration for the vector store. + * + * @returns - Promise that resolves to a new CouchbaseQueryVectorStore instance. 
+ */ + static async fromDocuments( + docs: Document[], + embeddings: EmbeddingsInterface, + config: CouchbaseQueryVectorStoreArgs + ): Promise { + const instance = await CouchbaseQueryVectorStore.initialize( + embeddings, + config + ); + await instance.addDocuments(docs); + return instance; + } +} From 8081ccbc96362764411f39b365aa2dd097ec49b3 Mon Sep 17 00:00:00 2001 From: Elliot Scribner Date: Mon, 9 Jun 2025 13:48:18 -0700 Subject: [PATCH 2/6] Cleanup Errors, Format + Lint --- .../src/vectorstores/couchbase_query.ts | 87 ++++++++----------- 1 file changed, 38 insertions(+), 49 deletions(-) diff --git a/libs/langchain-community/src/vectorstores/couchbase_query.ts b/libs/langchain-community/src/vectorstores/couchbase_query.ts index aad8e747f8bb..4a0ceb3a07a4 100644 --- a/libs/langchain-community/src/vectorstores/couchbase_query.ts +++ b/libs/langchain-community/src/vectorstores/couchbase_query.ts @@ -3,12 +3,7 @@ /* eslint-disable import/no-extraneous-dependencies */ import { EmbeddingsInterface } from "@langchain/core/embeddings"; import { VectorStore } from "@langchain/core/vectorstores"; -import { - Bucket, - Cluster, - Collection, - Scope, -} from "couchbase"; +import { Bucket, Cluster, Collection, Scope } from "couchbase"; import { Document } from "@langchain/core/documents"; import { v4 as uuid } from "uuid"; @@ -336,20 +331,11 @@ export class CouchbaseQueryVectorStore extends VectorStore { const { ids } = options; const deletePromises = ids.map((id) => this._collection.remove(id).catch((e: any) => { - console.error("error received while deleting document", e); throw new Error(`Delete failed with error: ${e}`); }) ); - try { - await Promise.all(deletePromises); - } catch (e) { - console.error( - "An error occurred with Promise.all at deleting all documents", - e - ); - throw e; - } + await Promise.all(deletePromises); } /** @@ -427,7 +413,6 @@ export class CouchbaseQueryVectorStore extends VectorStore { docsWithScore.push([doc, distance]); } } catch (err) { - 
console.log("error received"); throw new Error(`Query failed with error: ${err}`); } return docsWithScore; @@ -532,33 +517,24 @@ export class CouchbaseQueryVectorStore extends VectorStore { .upsert(currentDocumentKey, document[currentDocumentKey]) .then(() => currentDocumentKey) .catch((e: any) => { - console.error("error received while upserting document", e); throw new Error(`Upsert failed with error: ${e}`); }); }); - try { - // Upsert all documents asynchronously - const docIds = await Promise.all(upsertDocumentsPromises); - const successfulDocIds: string[] = []; - for (const id of docIds) { - if (id) { - successfulDocIds.push(id); - } + // Upsert all documents asynchronously + const docIds = await Promise.all(upsertDocumentsPromises); + const successfulDocIds: string[] = []; + for (const id of docIds) { + if (id) { + successfulDocIds.push(id); } - return successfulDocIds; - } catch (e) { - console.error( - "An error occurred with Promise.all at upserting all documents", - e - ); - throw e; } + return successfulDocIds; } /** * Create a new vector index for the Query vector store. - * + * * @param options - Configuration options for creating the index * @param options.indexType - Type of the index (BHIVE or COMPOSITE) to create * @param options.indexDescription - Description of the index like "IVF,SQ8" @@ -570,7 +546,7 @@ export class CouchbaseQueryVectorStore extends VectorStore { * @param options.whereClause - Optional where clause to filter the documents to index * @param options.indexScanNprobes - Number of probes to use for the index * @param options.indexTrainlist - Number of training samples to use for the index - * + * * @throws {Error} If index creation fails or invalid parameters are provided */ async createIndex(options: CreateIndexOptions): Promise { @@ -589,7 +565,9 @@ export class CouchbaseQueryVectorStore extends VectorStore { if (!Object.values(IndexType).includes(indexType)) { throw new Error( - `Invalid index type. Got ${indexType}. 
Expected one of: ${Object.values(IndexType).join(", ")}` + `Invalid index type. Got ${indexType}. Expected one of: ${Object.values( + IndexType + ).join(", ")}` ); } @@ -613,8 +591,8 @@ export class CouchbaseQueryVectorStore extends VectorStore { } catch (e) { throw new Error( "Vector dimension is required for creating Query index. " + - "Unable to determine the dimension from the embedding object. " + - `Error: ${e}` + "Unable to determine the dimension from the embedding object. " + + `Error: ${e}` ); } } @@ -651,28 +629,39 @@ export class CouchbaseQueryVectorStore extends VectorStore { if (indexType === IndexType.BHIVE) { finalIndexName = indexName || "langchain_bhive_query_index"; // BHIVE: Specialized vector index with INCLUDE clause for additional fields - indexQuery = `CREATE VECTOR INDEX \`${finalIndexName}\` ON \`${this.bucketName}\`.\`${this.scopeName}\`.\`${this.collectionName}\` ` + - `(\`${vectorFieldName}\` VECTOR) INCLUDE (${includeFields.map(f => `\`${f}\``).join(", ")}) ` + + indexQuery = + `CREATE VECTOR INDEX \`${finalIndexName}\` ON \`${this.bucketName}\`.\`${this.scopeName}\`.\`${this.collectionName}\` ` + + `(\`${vectorFieldName}\` VECTOR) INCLUDE (${includeFields + .map((f) => `\`${f}\``) + .join(", ")}) ` + `${whereClauseStr} USING GSI ${withClause}`; } else if (indexType === IndexType.COMPOSITE) { finalIndexName = indexName || "langchain_composite_query_index"; // COMPOSITE: General GSI index that includes vector field alongside other fields with VECTOR keyword - indexQuery = `CREATE INDEX \`${finalIndexName}\` ON \`${this.bucketName}\`.\`${this.scopeName}\`.\`${this.collectionName}\` ` + - `(${includeFields.map(f => `\`${f}\``).join(", ")}, \`${vectorFieldName}\` VECTOR) ` + + indexQuery = + `CREATE INDEX \`${finalIndexName}\` ON \`${this.bucketName}\`.\`${this.scopeName}\`.\`${this.collectionName}\` ` + + `(${includeFields + .map((f) => `\`${f}\``) + .join(", ")}, \`${vectorFieldName}\` VECTOR) ` + `${whereClauseStr} USING GSI 
${withClause}`; } else { throw new Error(`Unsupported index type: ${indexType}`); } - console.log(indexQuery); - try { await this.cluster.query(indexQuery); - console.log(`Successfully created ${indexType} index: ${finalIndexName}`); } catch (e) { - console.log(e); - if (e && typeof e === 'object' && 'cause' in e && e.cause && typeof e.cause === 'object' && 'first_error_message' in e.cause) { console.log("YO!"); - throw new Error(`Index creation failed with error: ${e.cause.first_error_message}`); + if ( + e && + typeof e === "object" && + "cause" in e && + e.cause && + typeof e.cause === "object" && + "first_error_message" in e.cause + ) { + throw new Error( + `Index creation failed with error: ${e.cause.first_error_message}` + ); } throw new Error(`Index creation failed with error: ${e}`); } @@ -729,4 +718,4 @@ export class CouchbaseQueryVectorStore extends VectorStore { await instance.addDocuments(docs); return instance; } -} +} From 078859d5af9494d236baebff8bd9e5b5c8ad00ee Mon Sep 17 00:00:00 2001 From: Elliot Scribner Date: Mon, 9 Jun 2025 13:50:17 -0700 Subject: [PATCH 3/6] Tests --- .../tests/couchbase_query.test.ts | 562 ++++++++++++++++++ 1 file changed, 562 insertions(+) create mode 100644 libs/langchain-community/src/vectorstores/tests/couchbase_query.test.ts diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase_query.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase_query.test.ts new file mode 100644 index 000000000000..50560ab7e239 --- /dev/null +++ b/libs/langchain-community/src/vectorstores/tests/couchbase_query.test.ts @@ -0,0 +1,562 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ +/* eslint-disable no-process-env */ +import { + describe, + test, + beforeEach, + afterAll, + expect, + beforeAll, +} from "@jest/globals"; +import { Cluster } from "couchbase"; +import { OpenAIEmbeddings } from "@langchain/openai"; +import { Document } from "@langchain/core/documents"; +import { faker } from 
"@faker-js/faker"; + +import { + CouchbaseQueryVectorStore, + CouchbaseQueryVectorStoreArgs, + DistanceStrategy, + IndexType, +} from "../couchbase_query.js"; + +// Helper function to delay execution +const delay = (ms: number) => + new Promise((resolve) => { + setTimeout(resolve, ms); + }); + +describe.skip("CouchbaseQueryVectorStore", () => { + // Configuration + const config = { + cluster: process.env.COUCHBASE_CLUSTER || "couchbase://localhost", + username: process.env.COUCHBASE_USERNAME || "Administrator", + password: process.env.COUCHBASE_PASSWORD || "password", + bucketName: "test-bucket", + indexTestBucketName: "test-index-bucket", + scopeName: "_default", + collectionName: "_default", + textKey: "text", + embeddingKey: "embedding", + distanceStrategy: DistanceStrategy.COSINE, + }; + + let cluster: Cluster; + let store: CouchbaseQueryVectorStore; + let indexTestStore: CouchbaseQueryVectorStore; + let embeddings: OpenAIEmbeddings; + + beforeAll(async () => { + // Create embeddings instance + embeddings = new OpenAIEmbeddings({ + openAIApiKey: process.env.OPENAI_API_KEY, + }); + + // Connect to Couchbase + cluster = await Cluster.connect(config.cluster, { + username: config.username, + password: config.password, + }); + + // Create bucket if it doesn't exist + try { + const buckets = await cluster.buckets().getAllBuckets(); + if (!buckets.some((bucket) => bucket.name === config.bucketName)) { + await cluster.buckets().createBucket({ + name: config.bucketName, + ramQuotaMB: 2000, + flushEnabled: true, + }); + } + // create a separate bucket for index testing + if ( + !buckets.some((bucket) => bucket.name === config.indexTestBucketName) + ) { + await cluster.buckets().createBucket({ + name: config.indexTestBucketName, + ramQuotaMB: 2000, + flushEnabled: true, + }); + } + } catch (err: any) { + if (err.code !== 605) { + // 605 is bucket_exists error + console.error("Error creating bucket:", err); + throw err; + } + } + + await delay(5000); + }); + + 
beforeEach(async () => { + await cluster.buckets().flushBucket(config.bucketName); + + // Initialize store + try { + const storeConfig: CouchbaseQueryVectorStoreArgs = { + cluster, + bucketName: config.bucketName, + scopeName: config.scopeName, + collectionName: config.collectionName, + textKey: config.textKey, + embeddingKey: config.embeddingKey, + }; + + store = await CouchbaseQueryVectorStore.initialize( + embeddings, + storeConfig + ); + + const indexTestStoreConfig: CouchbaseQueryVectorStoreArgs = { + cluster, + bucketName: config.indexTestBucketName, + scopeName: config.scopeName, + collectionName: config.collectionName, + textKey: config.textKey, + embeddingKey: config.embeddingKey, + }; + + indexTestStore = await CouchbaseQueryVectorStore.initialize( + embeddings, + indexTestStoreConfig + ); + } catch (error) { + console.error("Failed to initialize test suite:", error); + throw error; + } + }); + + afterAll(async () => { + if (cluster) { + await cluster.buckets().flushBucket(config.bucketName); + await cluster.close(); + } + }); + + // Helper function to create test data + const createTestData = (count: number) => { + const texts = Array.from({ length: count }, () => faker.lorem.paragraph()); + const metadatas = Array.from({ length: count }, () => ({ + source: faker.system.fileName(), + author: faker.person.fullName(), + })); + return { texts, metadatas }; + }; + + // Helper function to create bulk test data for index training + const createBulkTestData = (count: number) => { + const documents = []; + for (let i = 0; i < count; i += 1) { + documents.push( + new Document({ + pageContent: `Document ${i}: ${faker.hacker.phrase()}! 
${faker.company.catchPhrase()}`, + metadata: { + source: "bulk_test", + index: i, + category: faker.helpers.arrayElement([ + "tech", + "business", + "science", + "art", + ]), + rating: faker.number.int({ min: 1, max: 5 }), + }, + }) + ); + } + return documents; + }; + + // Helper function to add documents in batches for better performance + const addDocumentsInBatches = async ( + documents: Document[], + batchSize = 50 + ) => { + const allIds = []; + for (let i = 0; i < documents.length; i += batchSize) { + const batch = documents.slice(i, i + batchSize); + const ids = await indexTestStore.addDocuments(batch); + allIds.push(...ids); + + // Small delay between batches to avoid overwhelming the system + if (i + batchSize < documents.length) { + await delay(100); + } + } + return allIds; + }; + + describe("Initialization", () => { + test("should initialize with default values", async () => { + expect(store).toBeDefined(); + expect(store.embeddings).toBeDefined(); + }); + }); + + describe("Document Operations", () => { + test("should add documents with metadata", async () => { + const { texts, metadatas } = createTestData(2); + const documents = texts.map( + (text, i) => new Document({ pageContent: text, metadata: metadatas[i] }) + ); + + const ids = await store.addDocuments(documents); + expect(ids).toHaveLength(2); + + const results = await store.similaritySearch(texts[0], 1); + expect(results).toHaveLength(1); + expect(results[0].pageContent).toBe(texts[0]); + expect(results[0].metadata).toEqual(metadatas[0]); + }); + + test("should add documents with custom IDs", async () => { + const { texts } = createTestData(2); + const documents = texts.map( + (text) => new Document({ pageContent: text }) + ); + const customIds = ["doc1", "doc2"]; + + const ids = await store.addDocuments(documents, { ids: customIds }); + expect(ids).toEqual(customIds); + + const results = await store.similaritySearch(texts[0], 1); + expect(results).toHaveLength(1); + 
expect(results[0].pageContent).toBe(texts[0]); + }); + + test("should delete documents", async () => { + const { texts } = createTestData(2); + const documents = texts.map( + (text) => new Document({ pageContent: text }) + ); + + const ids = await store.addDocuments(documents); + expect(ids).toHaveLength(2); + + await store.delete({ ids }); + const results = await store.similaritySearch(texts[0], 1); + expect(results).toHaveLength(0); + }); + }); + + describe("Search Operations", () => { + test("should perform similarity search", async () => { + const { texts } = createTestData(2); + const documents = texts.map( + (text) => new Document({ pageContent: text }) + ); + + await store.addDocuments(documents); + + const results = await store.similaritySearch(texts[0], 1); + expect(results).toHaveLength(1); + expect(results[0].pageContent).toBe(texts[0]); + }); + + test("should perform similarity search with score", async () => { + const { texts } = createTestData(2); + const documents = texts.map( + (text) => new Document({ pageContent: text }) + ); + + await store.addDocuments(documents); + + const results = await store.similaritySearchWithScore(texts[0], 1); + expect(results).toHaveLength(1); + expect(results[0][0].pageContent).toBe(texts[0]); + expect(typeof results[0][1]).toBe("number"); + }); + + test("should perform similarity search by vector", async () => { + const { texts } = createTestData(2); + const documents = texts.map( + (text) => new Document({ pageContent: text }) + ); + + await store.addDocuments(documents); + + const queryEmbedding = await embeddings.embedQuery(texts[0]); + const results = await store.similaritySearchByVector(queryEmbedding, 1); + expect(results).toHaveLength(1); + expect(results[0].pageContent).toBe(texts[0]); + }); + + test("should perform similarity search with filters", async () => { + const { texts, metadatas } = createTestData(2); + const documents = texts.map( + (text, i) => new Document({ pageContent: text, metadata: 
metadatas[i] }) + ); + + await store.addDocuments(documents); + + const results = await store.similaritySearch(texts[0], 1, { + fields: ["text", "metadata.author"], + }); + expect(results).toHaveLength(1); + expect(results[0].pageContent).toBe(texts[0]); + expect(results[0].metadata.author).toBe(metadatas[0].author); + }); + }); + + describe("Factory Methods", () => { + test("should create store from texts", async () => { + const { texts, metadatas } = createTestData(2); + + const newStore = await CouchbaseQueryVectorStore.fromTexts( + texts, + metadatas, + embeddings, + { + cluster, + bucketName: config.bucketName, + scopeName: config.scopeName, + collectionName: config.collectionName, + textKey: config.textKey, + embeddingKey: config.embeddingKey, + } + ); + + const results = await newStore.similaritySearch(texts[0], 1); + expect(results).toHaveLength(1); + expect(results[0].pageContent).toBe(texts[0]); + expect(results[0].metadata).toEqual(metadatas[0]); + }); + + test("should create store from documents", async () => { + const { texts, metadatas } = createTestData(2); + const documents = texts.map( + (text, i) => new Document({ pageContent: text, metadata: metadatas[i] }) + ); + + const newStore = await CouchbaseQueryVectorStore.fromDocuments( + documents, + embeddings, + { + cluster, + bucketName: config.bucketName, + scopeName: config.scopeName, + collectionName: config.collectionName, + textKey: config.textKey, + embeddingKey: config.embeddingKey, + } + ); + + const results = await newStore.similaritySearch(texts[0], 1); + expect(results).toHaveLength(1); + expect(results[0].pageContent).toBe(texts[0]); + expect(results[0].metadata).toEqual(metadatas[0]); + }); + }); + + describe("Index Creation", () => { + const MINIMUM_DOCS_FOR_TRAINING = 1200; // Slightly above the 1024 minimum + let bulkDocumentIds: string[] = []; + + beforeAll(async () => { + // Create bulk test data + const bulkDocuments = createBulkTestData(MINIMUM_DOCS_FOR_TRAINING); + + // Add 
documents in batches for better performance + bulkDocumentIds = await addDocumentsInBatches(bulkDocuments, 100); + + // Wait a bit for documents to be indexed + await delay(10000); + }); + + afterAll(async () => { + // Clean up bulk documents + if (bulkDocumentIds.length > 0) { + try { + await indexTestStore.delete({ ids: bulkDocumentIds }); + } catch (error) { + console.warn("Error cleaning up bulk documents:", error); + } + } + + // Clean up indexes + await dropAllIndexesWithManager(cluster, config.indexTestBucketName); + }); + + async function dropAllIndexesWithManager( + cluster: Cluster, + bucketName: string + ) { + const queryIndexManager = cluster.queryIndexes(); + + try { + // Get all indexes + const indexes = await queryIndexManager.getAllIndexes(bucketName); + + // Drop all secondary indexes + for (const index of indexes) { + if (!index.isPrimary) { + await queryIndexManager.dropIndex(bucketName, index.name); + } + } + } catch (error) { + console.error("Error:", error); + } + } + + test("should create BHIVE vector index", async () => { + const createBhiveIndexOptions = { + indexType: IndexType.BHIVE, + indexDescription: "IVF1024,SQ8", + distanceMetric: DistanceStrategy.COSINE, + indexName: "my_bhive_vector_index", + vectorDimension: 1536, + fields: ["text", "metadata"], + whereClause: "metadata.source = 'bulk_test'", + indexScanNprobes: 10, + }; + + // Test that createIndex doesn't throw an error + await expect( + indexTestStore.createIndex(createBhiveIndexOptions) + ).resolves.not.toThrow(); + + // Wait a bit for index creation to process + await delay(2000); + }); + + test("should create COMPOSITE vector index", async () => { + const createCompositeIndexOptions = { + indexType: IndexType.COMPOSITE, + indexDescription: "IVF1024,SQ8", + distanceMetric: DistanceStrategy.COSINE, + indexName: "my_composite_vector_index", + vectorDimension: 1536, + fields: ["text", "metadata.category"], + whereClause: "metadata.source = 'bulk_test'", + indexScanNprobes: 3, + 
}; + + // Test that createIndex doesn't throw an error + await expect( + indexTestStore.createIndex(createCompositeIndexOptions) + ).resolves.not.toThrow(); + + // Wait a bit for index creation to process + await delay(2000); + }); + + test("should create index with minimal options", async () => { + const minimalOptions = { + indexType: IndexType.BHIVE, + indexDescription: "IVF,SQ8", + indexName: "minimal_options_index", + whereClause: "metadata.source = 'bulk_test'", + }; + + // Test that createIndex works with minimal options + await expect( + indexTestStore.createIndex(minimalOptions) + ).resolves.not.toThrow(); + + // Wait a bit for index creation to process + await delay(2000); + }); + + test("should auto-detect vector dimension from embeddings", async () => { + const optionsWithoutDimension = { + indexType: IndexType.BHIVE, + indexDescription: "IVF,SQ8", + indexName: "auto_dimension_index", + whereClause: "metadata.source = 'bulk_test'", + }; + + // Test that createIndex works without specifying dimension + await expect( + indexTestStore.createIndex(optionsWithoutDimension) + ).resolves.not.toThrow(); + + // Wait a bit for index creation to process + await delay(2000); + }); + + test("should handle index creation errors gracefully", async () => { + const invalidOptions = { + indexType: IndexType.BHIVE, + indexDescription: "", // Empty description should cause an error + indexName: "invalid_index", + }; + + // Test that createIndex handles errors gracefully + await expect( + indexTestStore.createIndex(invalidOptions) + ).rejects.toThrow(); + }); + + test("should create both BHIVE and COMPOSITE indexes sequentially", async () => { + const createBhiveIndexOptions = { + indexType: IndexType.BHIVE, + indexDescription: "IVF1024,SQ8", + distanceMetric: DistanceStrategy.COSINE, + indexName: "sequential_bhive_index", + whereClause: "metadata.source = 'bulk_test'", + }; + + const createCompositeIndexOptions = { + indexType: IndexType.COMPOSITE, + indexDescription: 
"IVF1024,SQ8", + distanceMetric: DistanceStrategy.COSINE, + indexName: "sequential_composite_index", + whereClause: "metadata.source = 'bulk_test'", + }; + + // Test creating both index types sequentially + await expect( + indexTestStore.createIndex(createBhiveIndexOptions) + ).resolves.not.toThrow(); + await delay(3000); + await expect( + indexTestStore.createIndex(createCompositeIndexOptions) + ).resolves.not.toThrow(); + + // Wait a bit for index creation to process + await delay(2000); + }); + + test("should use default distance strategy when not specified", async () => { + const optionsWithoutDistance = { + indexType: IndexType.BHIVE, + indexDescription: "IVF,SQ8", + indexName: "default_distance_index", + whereClause: "metadata.source = 'bulk_test'", + }; + + // Test that createIndex uses default distance strategy + await expect( + indexTestStore.createIndex(optionsWithoutDistance) + ).resolves.not.toThrow(); + + // Wait a bit for index creation to process + await delay(2000); + }); + + test("should handle different distance strategies", async () => { + const distanceStrategies = [ + DistanceStrategy.DOT, + DistanceStrategy.L2, + DistanceStrategy.EUCLIDEAN, + DistanceStrategy.COSINE, + ]; + + for (let i = 0; i < distanceStrategies.length; i += 1) { + const options = { + indexType: IndexType.BHIVE, + indexDescription: "IVF,SQ8", + distanceMetric: distanceStrategies[i], + indexName: `distance_test_index_${i}`, + whereClause: "metadata.source = 'bulk_test'", + }; + + await expect( + indexTestStore.createIndex(options) + ).resolves.not.toThrow(); + await delay(1000); + } + }); + }); +}); From c3c1b180d05f883d83953948197cad9460e8e95c Mon Sep 17 00:00:00 2001 From: Elliot Scribner Date: Mon, 9 Jun 2025 13:56:39 -0700 Subject: [PATCH 4/6] Initial Docs --- .../vectorstores/couchbase_query.mdx | 387 ++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 docs/core_docs/docs/integrations/vectorstores/couchbase_query.mdx diff --git 
a/docs/core_docs/docs/integrations/vectorstores/couchbase_query.mdx b/docs/core_docs/docs/integrations/vectorstores/couchbase_query.mdx new file mode 100644 index 000000000000..110e439bfb92 --- /dev/null +++ b/docs/core_docs/docs/integrations/vectorstores/couchbase_query.mdx @@ -0,0 +1,387 @@ +--- +hide_table_of_contents: true +sidebar_class_name: node-only +--- + +import CodeBlock from "@theme/CodeBlock"; + +# Couchbase Query Vector Store + +[Couchbase](http://couchbase.com/) is an award-winning distributed NoSQL cloud database that delivers unmatched versatility, performance, scalability, and financial value for all of your cloud, mobile, AI, and edge computing applications. + +The `CouchbaseQueryVectorStore` is an implementation that uses Couchbase's Query service (SQL++) for vector similarity search instead of the Search service. This provides an alternative approach for vector operations using SQL++ queries with vector functions. + +## Key Differences from CouchbaseVectorStore + +- **Query Service**: Uses Couchbase's Query service with SQL++ instead of the Search service +- **No Index Required**: Does not require a pre-configured search index for basic operations +- **SQL++ Syntax**: Supports WHERE clauses and SQL++ query syntax for filtering +- **Vector Functions**: Uses `APPROX_VECTOR_DISTANCE` function for similarity calculations +- **Distance Strategies**: Supports multiple distance strategies (Euclidean, Cosine, Dot Product) + +## Installation + +```bash npm2yarn +npm install couchbase @langchain/openai @langchain/community @langchain/core +``` + +## Create Couchbase Connection Object + +We create a connection to the Couchbase cluster initially and then pass the cluster object to the Vector Store. Here, we are connecting using the username and password. +You can also connect using any other supported way to your cluster. 
+ +For more information on connecting to the Couchbase cluster, please check the [Node SDK documentation](https://docs.couchbase.com/nodejs-sdk/current/hello-world/start-using-sdk.html#connect). + +```typescript +import { Cluster } from "couchbase"; + +const connectionString = "couchbase://localhost"; // or couchbases://localhost if you are using TLS +const dbUsername = "Administrator"; // valid database user with read access to the bucket being queried +const dbPassword = "Password"; // password for the database user + +const couchbaseClient = await Cluster.connect(connectionString, { + username: dbUsername, + password: dbPassword, + configProfile: "wanDevelopment", +}); +``` + +## Basic Setup + +```typescript +import { CouchbaseQueryVectorStore, DistanceStrategy } from "@langchain/community/vectorstores/couchbase_query"; +import { OpenAIEmbeddings } from "@langchain/openai"; +import { Cluster } from "couchbase"; + +// Connect to Couchbase +const cluster = await Cluster.connect("couchbase://localhost", { + username: "Administrator", + password: "password", +}); + +// Initialize embeddings +const embeddings = new OpenAIEmbeddings(); + +// Configure the vector store +const vectorStore = await CouchbaseQueryVectorStore.initialize(embeddings, { + cluster, + bucketName: "my-bucket", + scopeName: "my-scope", + collectionName: "my-collection", + textKey: "text", // optional, defaults to "text" + embeddingKey: "embedding", // optional, defaults to "embedding" + distanceStrategy: DistanceStrategy.COSINE, // optional, defaults to DOT +}); +``` + +## Creating Vector Indexes + +The Query vector store supports creating vector indexes to improve search performance. 
There are two types of indexes available: + +### BHIVE Index +A specialized vector index optimized for vector operations using Couchbase's vector indexing capabilities: + +```typescript +import { IndexType } from "@langchain/community/vectorstores/couchbase_query"; + +await vectorStore.createIndex({ + indexType: IndexType.BHIVE, + indexDescription: "IVF,SQ8", + indexName: "my_vector_index", // optional + vectorDimension: 1536, // optional, auto-detected from embeddings + distanceMetric: DistanceStrategy.COSINE, // optional, uses store default + fields: ["text", "metadata"], // optional, defaults to text field + whereClause: "type = 'document'", // optional filter + indexScanNprobes: 10, // optional tuning parameter + indexTrainlist: 1000, // optional tuning parameter +}); +``` + +**Generated SQL++:** +```sql +CREATE VECTOR INDEX `my_vector_index` ON `bucket`.`scope`.`collection` +(`embedding` VECTOR) INCLUDE (`text`, `metadata`) +WHERE type = 'document' USING GSI WITH {'dimension': 1536, 'similarity': 'cosine', 'description': 'IVF,SQ8', 'scan_nprobes': 10, 'train_list': 1000} +``` + +### Composite Index +A general-purpose GSI index that includes vector fields alongside scalar fields: + +```typescript +await vectorStore.createIndex({ + indexType: IndexType.COMPOSITE, + indexDescription: "IVF1024,SQ8", + indexName: "my_composite_index", + vectorDimension: 1536, + fields: ["text", "metadata.category"], + whereClause: "created_date > '2023-01-01'", + indexScanNprobes: 3, + indexTrainlist: 10000, +}); +``` + +**Generated SQL++:** +```sql +CREATE INDEX `my_composite_index` ON `bucket`.`scope`.`collection` +(`text`, `metadata.category`, `embedding` VECTOR) +WHERE created_date > '2023-01-01' USING GSI +WITH {'dimension': 1536, 'similarity': 'dot', 'description': 'IVF1024,SQ8', 'scan_nprobes': 3, 'train_list': 10000} +``` + +### Key Differences + +| Aspect | BHIVE Index | COMPOSITE Index | +|--------|-------------|-----------------| +| **SQL++ Syntax** | `CREATE VECTOR INDEX` | `CREATE INDEX` | +| **Vector Field**
| `(field VECTOR)` with `INCLUDE` clause | `(field1, field2, vector_field VECTOR)` | +| **Vector Parameters** | Supports all vector parameters | Supports all vector parameters | +| **Optimization** | Specialized for vector operations | General-purpose GSI with vector support | +| **Use Case** | Pure vector similarity search | Mixed vector and scalar queries | +| **Performance** | Optimized for vector distance calculations | Good for hybrid queries | +| **Tuning Parameters** | Supports `indexScanNprobes`, `indexTrainlist` | Supports `indexScanNprobes`, `indexTrainlist` | +| **Limitations** | Only one vector field, uses INCLUDE for other fields | One vector field among multiple index keys | + +## Basic Vector Search Example + +The following example showcases how to use Couchbase Query vector search and perform similarity search. + +```typescript +import { OpenAIEmbeddings } from "@langchain/openai"; +import { + CouchbaseQueryVectorStore, + DistanceStrategy, +} from "@langchain/community/vectorstores/couchbase_query"; +import { Cluster } from "couchbase"; +import { Document } from "@langchain/core/documents"; + +const connectionString = process.env.COUCHBASE_DB_CONN_STR ?? "couchbase://localhost"; +const databaseUsername = process.env.COUCHBASE_DB_USERNAME ?? "Administrator"; +const databasePassword = process.env.COUCHBASE_DB_PASSWORD ?? 
"Password"; + +const couchbaseClient = await Cluster.connect(connectionString, { + username: databaseUsername, + password: databasePassword, + configProfile: "wanDevelopment", +}); + +// OpenAI API Key is required to use OpenAIEmbeddings +const embeddings = new OpenAIEmbeddings({ + apiKey: process.env.OPENAI_API_KEY, +}); + +const vectorStore = await CouchbaseQueryVectorStore.initialize(embeddings, { + cluster: couchbaseClient, + bucketName: "testing", + scopeName: "_default", + collectionName: "_default", + textKey: "text", + embeddingKey: "embedding", + distanceStrategy: DistanceStrategy.COSINE, +}); + +// Add documents +const documents = [ + new Document({ + pageContent: "Couchbase is a NoSQL database", + metadata: { category: "database", type: "document" } + }), + new Document({ + pageContent: "Vector search enables semantic similarity", + metadata: { category: "ai", type: "document" } + }) +]; + +await vectorStore.addDocuments(documents); + +// Perform similarity search +const query = "What is a NoSQL database?"; +const results = await vectorStore.similaritySearch(query, 4); +console.log("Search results:", results[0]); + +// Search with scores +const resultsWithScores = await vectorStore.similaritySearchWithScore(query, 4); +console.log("Document:", resultsWithScores[0][0]); +console.log("Score:", resultsWithScores[0][1]); +``` + +## Searching Documents + +### Basic Similarity Search + +```typescript +// Basic similarity search +const results = await vectorStore.similaritySearch( + "What is a NoSQL database?", + 4 +); +``` + +### Search with Filters + +```typescript +// Search with filters +const filteredResults = await vectorStore.similaritySearch( + "database technology", + 4, + { + where: "metadata.category = 'database'", + fields: ["text", "metadata.category"] + } +); +``` + +### Search with Scores + +```typescript +// Search with scores +const resultsWithScores = await vectorStore.similaritySearchWithScore( + "vector search capabilities", + 4 +); +``` + 
+### Complex Filtering + +```typescript +const results = await vectorStore.similaritySearch( + "search query", + 10, + { + where: "metadata.category IN ['tech', 'science'] AND metadata.rating >= 4", + fields: ["content", "metadata.title", "metadata.rating"] + } +); +``` + +## Configuration Options + +### Distance Strategies + +- `DistanceStrategy.DOT` - Dot product (default) +- `DistanceStrategy.L2` - L2 (Euclidean) distance +- `DistanceStrategy.EUCLIDEAN` - Euclidean distance +- `DistanceStrategy.COSINE` - Cosine distance +- `DistanceStrategy.L2_SQUARED` - Squared L2 distance +- `DistanceStrategy.EUCLIDEAN_SQUARED` - Squared Euclidean distance + +### Index Types + +- `IndexType.BHIVE` - Specialized vector index for optimal vector search performance +- `IndexType.COMPOSITE` - General-purpose index that can include vector and scalar fields + +## Advanced Usage + +### Custom Vector Fields + +```typescript +const vectorStore = await CouchbaseQueryVectorStore.initialize(embeddings, { + cluster, + bucketName: "my-bucket", + scopeName: "my-scope", + collectionName: "my-collection", + textKey: "content", + embeddingKey: "vector_embedding", + distanceStrategy: DistanceStrategy.L2, +}); +``` + +### Creating from Texts + +```typescript +const texts = [ + "Couchbase is a NoSQL database", + "Vector search enables semantic similarity" +]; + +const metadatas = [ + { category: "database" }, + { category: "ai" } +]; + +const vectorStore = await CouchbaseQueryVectorStore.fromTexts( + texts, + metadatas, + embeddings, + { + cluster, + bucketName: "my-bucket", + scopeName: "my-scope", + collectionName: "my-collection" + } +); +``` + +### Deleting Documents + +```typescript +const documentIds = ["doc1", "doc2", "doc3"]; +await vectorStore.delete({ ids: documentIds }); +``` + +## Performance Considerations + +1. **Create Indexes**: Use `createIndex()` to create appropriate vector indexes for better performance +2. 
**Choose Index Type**: + - Use **BHIVE indexes** for pure vector search workloads where you primarily perform similarity searches + - Use **COMPOSITE indexes** for mixed queries that combine vector similarity with scalar field filtering +3. **Tune Parameters**: Adjust `indexScanNprobes` and `indexTrainlist` based on your data size and performance requirements +4. **Filter Early**: Use WHERE clauses to reduce the search space before vector calculations +5. **Index Strategy**: + - **BHIVE**: Better for high-performance vector similarity search with minimal scalar filtering + - **COMPOSITE**: Better when you frequently filter by both vector similarity and scalar fields in the same query + +## Error Handling + +```typescript +try { + await vectorStore.createIndex({ + indexType: IndexType.BHIVE, + indexDescription: "IVF,SQ8", + }); +} catch (error) { + console.error("Index creation failed:", error.message); +} +``` + +### Common Errors + +#### Insufficient Training Data +If you see errors related to insufficient training data, you may need to: +- Increase the `indexTrainlist` parameter (default recommendation: ~50 vectors per centroid) +- Ensure you have enough documents with vector embeddings in your collection +- For collections with < 1 million vectors, use `number_of_vectors / 1000` for centroids +- For larger collections, use `sqrt(number_of_vectors)` for centroids + +## Comparison with CouchbaseVectorStore + +| Feature | CouchbaseQueryVectorStore | CouchbaseVectorStore | +|---------|---------------------------|----------------------| +| Service | Query (SQL++) | Search (FTS) | +| Index Required | Optional (for performance) | Required | +| Query Language | SQL++ WHERE clauses | Search query syntax | +| Vector Functions | APPROX_VECTOR_DISTANCE | VectorQuery API | +| Setup Complexity | Lower | Higher | +| Performance | Good with indexes | Optimized for search | + +
+
+ +# Frequently Asked Questions + +## Question: Do I need to create an index before using CouchbaseQueryVectorStore? + +No, unlike the Search-based CouchbaseVectorStore, the Query-based implementation can work without pre-created indexes. However, creating appropriate vector indexes (BHIVE or COMPOSITE) will significantly improve query performance. + +## Question: When should I use BHIVE vs COMPOSITE indexes? + +- Use **BHIVE indexes** when you primarily perform vector similarity searches with minimal filtering on other fields +- Use **COMPOSITE indexes** when you frequently combine vector similarity with filtering on scalar fields in the same query + +## Question: Can I use both CouchbaseVectorStore and CouchbaseQueryVectorStore on the same data? + +Yes, both can work on the same document structure. However, they use different services (Search vs Query) and have different indexing requirements. + +## Related + +- Vector store [conceptual guide](/docs/concepts/#vectorstores) +- Vector store [how-to guides](/docs/how_to/#vectorstores) From a88d63722131c4270c6258e0cb00abc65b3e3c65 Mon Sep 17 00:00:00 2001 From: Elliot Scribner Date: Wed, 11 Jun 2025 07:18:23 -0700 Subject: [PATCH 5/6] Adjust `train_list` param --- libs/langchain-community/src/vectorstores/couchbase_query.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain-community/src/vectorstores/couchbase_query.ts b/libs/langchain-community/src/vectorstores/couchbase_query.ts index 4a0ceb3a07a4..ef4ddaaf0911 100644 --- a/libs/langchain-community/src/vectorstores/couchbase_query.ts +++ b/libs/langchain-community/src/vectorstores/couchbase_query.ts @@ -608,7 +608,7 @@ export class CouchbaseQueryVectorStore extends VectorStore { indexParams.scan_nprobes = indexScanNprobes; } if (indexTrainlist) { - indexParams.trainlist = indexTrainlist; + indexParams.train_list = indexTrainlist; } // Add the text field to the fields if empty or if it is not present From 
826a054a9d4c7a31110a4cdb5a5dae78a2467ae4 Mon Sep 17 00:00:00 2001 From: Elliot Scribner Date: Wed, 11 Jun 2025 07:19:10 -0700 Subject: [PATCH 6/6] Test Cleanup --- .../tests/couchbase_query.test.ts | 98 +++++++++++++------ 1 file changed, 67 insertions(+), 31 deletions(-) diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase_query.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase_query.test.ts index 50560ab7e239..6ac7bd49a5d5 100644 --- a/libs/langchain-community/src/vectorstores/tests/couchbase_query.test.ts +++ b/libs/langchain-community/src/vectorstores/tests/couchbase_query.test.ts @@ -20,12 +20,6 @@ import { IndexType, } from "../couchbase_query.js"; -// Helper function to delay execution -const delay = (ms: number) => - new Promise((resolve) => { - setTimeout(resolve, ms); - }); - describe.skip("CouchbaseQueryVectorStore", () => { // Configuration const config = { @@ -85,8 +79,6 @@ describe.skip("CouchbaseQueryVectorStore", () => { throw err; } } - - await delay(5000); }); beforeEach(async () => { @@ -178,11 +170,6 @@ describe.skip("CouchbaseQueryVectorStore", () => { const batch = documents.slice(i, i + batchSize); const ids = await indexTestStore.addDocuments(batch); allIds.push(...ids); - - // Small delay between batches to avoid overwhelming the system - if (i + batchSize < documents.length) { - await delay(100); - } } return allIds; }; @@ -359,9 +346,6 @@ describe.skip("CouchbaseQueryVectorStore", () => { // Add documents in batches for better performance bulkDocumentIds = await addDocumentsInBatches(bulkDocuments, 100); - - // Wait a bit for documents to be indexed - await delay(10000); }); afterAll(async () => { @@ -409,6 +393,7 @@ describe.skip("CouchbaseQueryVectorStore", () => { fields: ["text", "metadata"], whereClause: "metadata.source = 'bulk_test'", indexScanNprobes: 10, + indexTrainlist: 1024, }; // Test that createIndex doesn't throw an error @@ -416,8 +401,14 @@ describe.skip("CouchbaseQueryVectorStore", () 
=> { indexTestStore.createIndex(createBhiveIndexOptions) ).resolves.not.toThrow(); - // Wait a bit for index creation to process - await delay(2000); + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some( + (index) => index.name === createBhiveIndexOptions.indexName + ) + ).toBe(true); }); test("should create COMPOSITE vector index", async () => { @@ -430,6 +421,7 @@ describe.skip("CouchbaseQueryVectorStore", () => { fields: ["text", "metadata.category"], whereClause: "metadata.source = 'bulk_test'", indexScanNprobes: 3, + indexTrainlist: 1024, }; // Test that createIndex doesn't throw an error @@ -437,8 +429,14 @@ describe.skip("CouchbaseQueryVectorStore", () => { indexTestStore.createIndex(createCompositeIndexOptions) ).resolves.not.toThrow(); - // Wait a bit for index creation to process - await delay(2000); + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some( + (index) => index.name === createCompositeIndexOptions.indexName + ) + ).toBe(true); }); test("should create index with minimal options", async () => { @@ -454,8 +452,12 @@ describe.skip("CouchbaseQueryVectorStore", () => { indexTestStore.createIndex(minimalOptions) ).resolves.not.toThrow(); - // Wait a bit for index creation to process - await delay(2000); + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some((index) => index.name === minimalOptions.indexName) + ).toBe(true); }); test("should auto-detect vector dimension from embeddings", async () => { @@ -471,8 +473,14 @@ describe.skip("CouchbaseQueryVectorStore", () => { indexTestStore.createIndex(optionsWithoutDimension) ).resolves.not.toThrow(); - // Wait a bit for index creation to process - await delay(2000); + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some( + (index) => 
index.name === optionsWithoutDimension.indexName + ) + ).toBe(true); }); test("should handle index creation errors gracefully", async () => { @@ -486,6 +494,13 @@ describe.skip("CouchbaseQueryVectorStore", () => { await expect( indexTestStore.createIndex(invalidOptions) ).rejects.toThrow(); + + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some((index) => index.name === invalidOptions.indexName) + ).toBe(false); }); test("should create both BHIVE and COMPOSITE indexes sequentially", async () => { @@ -509,13 +524,23 @@ describe.skip("CouchbaseQueryVectorStore", () => { await expect( indexTestStore.createIndex(createBhiveIndexOptions) ).resolves.not.toThrow(); - await delay(3000); await expect( indexTestStore.createIndex(createCompositeIndexOptions) ).resolves.not.toThrow(); - // Wait a bit for index creation to process - await delay(2000); + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some( + (index) => index.name === createBhiveIndexOptions.indexName + ) + ).toBe(true); + expect( + indexes.some( + (index) => index.name === createCompositeIndexOptions.indexName + ) + ).toBe(true); }); test("should use default distance strategy when not specified", async () => { @@ -531,8 +556,12 @@ describe.skip("CouchbaseQueryVectorStore", () => { indexTestStore.createIndex(optionsWithoutDistance) ).resolves.not.toThrow(); - // Wait a bit for index creation to process - await delay(2000); + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some((index) => index.name === optionsWithoutDistance.indexName) + ).toBe(true); }); test("should handle different distance strategies", async () => { @@ -541,6 +570,8 @@ describe.skip("CouchbaseQueryVectorStore", () => { DistanceStrategy.L2, DistanceStrategy.EUCLIDEAN, DistanceStrategy.COSINE, + DistanceStrategy.L2_SQUARED, + 
DistanceStrategy.EUCLIDEAN_SQUARED, ]; for (let i = 0; i < distanceStrategies.length; i += 1) { @@ -555,8 +586,13 @@ describe.skip("CouchbaseQueryVectorStore", () => { await expect( indexTestStore.createIndex(options) ).resolves.not.toThrow(); - await delay(1000); + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect(indexes.some((index) => index.name === options.indexName)).toBe( + true + ); } - }); + }, 60000); }); });