Skip to content

Commit

Permalink
Adding pgvector as embedding store
Browse files Browse the repository at this point in the history
adding pgvector as embedding store

detect if pgvector is installable

make sure the exception is related to the missing extension

make dimension config property mandatory

Update pgvector/runtime/src/main/resources/META-INF/quarkus-extension.yaml

Update pgvector/runtime/src/main/resources/META-INF/quarkus-extension.yaml

various refactoring based on feedback and added a real test

deleted readme

Update pgvector/deployment/src/main/java/io/quarkiverse/langchain4j/pgvector/deployment/Langchain4jPgvectorProcessor.java

remove useless description

Pinecone embedding store

adding pgvector as embedding store

add documentation

generated config doc and added pgvector to doc's pom

pgvector store
  • Loading branch information
sebastienblanc committed Dec 4, 2023
1 parent 1ba5659 commit 2deb50b
Show file tree
Hide file tree
Showing 15 changed files with 1,057 additions and 14 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package io.quarkiverse.langchain4j.samples;

import static dev.langchain4j.data.document.splitter.DocumentSplitters.recursive;

import java.util.List;

import jakarta.enterprise.context.ApplicationScoped;
import jakarta.inject.Inject;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.store.embedding.EmbeddingStoreIngestor;
import io.quarkiverse.langchain4j.pgvector.PgVectorEmbeddingStore;

/**
 * Sample bean showing how documents can be ingested into the pgvector embedding store.
 */
@ApplicationScoped
public class IngestorExampleWithPgvector {

    /**
     * The embedding store (the database) the ingested content is written to.
     * The bean is provided by the quarkus-langchain4j-pgvector extension.
     */
    @Inject
    PgVectorEmbeddingStore store;

    /**
     * The embedding model (how the vector of a document is computed).
     * The bean is provided by the LLM (like openai) extension.
     */
    @Inject
    EmbeddingModel embeddingModel;

    /**
     * Ingests the given documents using an {@link EmbeddingStoreIngestor} wired with the injected
     * embedding model and embedding store, and a {@code recursive(500, 0)} document splitter.
     *
     * @param documents the documents to ingest
     */
    public void ingest(List<Document> documents) {
        // Warning - this can take a long time...
        EmbeddingStoreIngestor.builder()
                .documentSplitter(recursive(500, 0))
                .embeddingModel(embeddingModel)
                .embeddingStore(store)
                .build()
                .ingest(documents);
    }
}
114 changes: 114 additions & 0 deletions docs/modules/ROOT/pages/includes/quarkus-langchain4j-pgvector.adoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@

:summaryTableId: quarkus-langchain4j-pgvector
[.configuration-legend]
icon:lock[title=Fixed at build time] Configuration property fixed at build time - All other configuration properties are overridable at runtime
[.configuration-reference.searchable, cols="80,.^10,.^10"]
|===

h|[[quarkus-langchain4j-pgvector_configuration]]link:#quarkus-langchain4j-pgvector_configuration[Configuration property]

h|Type
h|Default

a| [[quarkus-langchain4j-pgvector_quarkus.langchain4j.pgvector.table]]`link:#quarkus-langchain4j-pgvector_quarkus.langchain4j.pgvector.table[quarkus.langchain4j.pgvector.table]`


[.description]
--
The table name for storing embeddings

ifdef::add-copy-button-to-env-var[]
Environment variable: env_var_with_copy_button:+++QUARKUS_LANGCHAIN4J_PGVECTOR_TABLE+++[]
endif::add-copy-button-to-env-var[]
ifndef::add-copy-button-to-env-var[]
Environment variable: `+++QUARKUS_LANGCHAIN4J_PGVECTOR_TABLE+++`
endif::add-copy-button-to-env-var[]
--|string
|`embeddings`


a| [[quarkus-langchain4j-pgvector_quarkus.langchain4j.pgvector.dimension]]`link:#quarkus-langchain4j-pgvector_quarkus.langchain4j.pgvector.dimension[quarkus.langchain4j.pgvector.dimension]`


[.description]
--
The dimension of the embedding vectors. This has to be the same as the dimension of vectors produced by the embedding model that you use. For example, AllMiniLmL6V2QuantizedEmbeddingModel produces vectors of dimension 384. OpenAI's text-embedding-ada-002 produces vectors of dimension 1536.

ifdef::add-copy-button-to-env-var[]
Environment variable: env_var_with_copy_button:+++QUARKUS_LANGCHAIN4J_PGVECTOR_DIMENSION+++[]
endif::add-copy-button-to-env-var[]
ifndef::add-copy-button-to-env-var[]
Environment variable: `+++QUARKUS_LANGCHAIN4J_PGVECTOR_DIMENSION+++`
endif::add-copy-button-to-env-var[]
--|int
|required icon:exclamation-circle[title=Configuration property is required]


a| [[quarkus-langchain4j-pgvector_quarkus.langchain4j.pgvector.use-index]]`link:#quarkus-langchain4j-pgvector_quarkus.langchain4j.pgvector.use-index[quarkus.langchain4j.pgvector.use-index]`


[.description]
--
Use index or not

ifdef::add-copy-button-to-env-var[]
Environment variable: env_var_with_copy_button:+++QUARKUS_LANGCHAIN4J_PGVECTOR_USE_INDEX+++[]
endif::add-copy-button-to-env-var[]
ifndef::add-copy-button-to-env-var[]
Environment variable: `+++QUARKUS_LANGCHAIN4J_PGVECTOR_USE_INDEX+++`
endif::add-copy-button-to-env-var[]
--|boolean
|`false`


a| [[quarkus-langchain4j-pgvector_quarkus.langchain4j.pgvector.index-list-size]]`link:#quarkus-langchain4j-pgvector_quarkus.langchain4j.pgvector.index-list-size[quarkus.langchain4j.pgvector.index-list-size]`


[.description]
--
index size

ifdef::add-copy-button-to-env-var[]
Environment variable: env_var_with_copy_button:+++QUARKUS_LANGCHAIN4J_PGVECTOR_INDEX_LIST_SIZE+++[]
endif::add-copy-button-to-env-var[]
ifndef::add-copy-button-to-env-var[]
Environment variable: `+++QUARKUS_LANGCHAIN4J_PGVECTOR_INDEX_LIST_SIZE+++`
endif::add-copy-button-to-env-var[]
--|int
|`0`


a| [[quarkus-langchain4j-pgvector_quarkus.langchain4j.pgvector.create-table]]`link:#quarkus-langchain4j-pgvector_quarkus.langchain4j.pgvector.create-table[quarkus.langchain4j.pgvector.create-table]`


[.description]
--
Create table or not

ifdef::add-copy-button-to-env-var[]
Environment variable: env_var_with_copy_button:+++QUARKUS_LANGCHAIN4J_PGVECTOR_CREATE_TABLE+++[]
endif::add-copy-button-to-env-var[]
ifndef::add-copy-button-to-env-var[]
Environment variable: `+++QUARKUS_LANGCHAIN4J_PGVECTOR_CREATE_TABLE+++`
endif::add-copy-button-to-env-var[]
--|boolean
|`true`


a| [[quarkus-langchain4j-pgvector_quarkus.langchain4j.pgvector.drop-table-first]]`link:#quarkus-langchain4j-pgvector_quarkus.langchain4j.pgvector.drop-table-first[quarkus.langchain4j.pgvector.drop-table-first]`


[.description]
--
Drop table or not

ifdef::add-copy-button-to-env-var[]
Environment variable: env_var_with_copy_button:+++QUARKUS_LANGCHAIN4J_PGVECTOR_DROP_TABLE_FIRST+++[]
endif::add-copy-button-to-env-var[]
ifndef::add-copy-button-to-env-var[]
Environment variable: `+++QUARKUS_LANGCHAIN4J_PGVECTOR_DROP_TABLE_FIRST+++`
endif::add-copy-button-to-env-var[]
--|boolean
|`false`

|===
42 changes: 42 additions & 0 deletions docs/modules/ROOT/pages/pgvector-store.adoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
= Pgvector Document Store for Retrieval Augmented Generation (RAG)

include::./includes/attributes.adoc[]

When implementing Retrieval Augmented Generation (RAG), a capable document store is necessary. This guide will explain how to leverage a pgvector database as the document store.

== Leveraging the pgvector Document Store

To utilize the pgvector document store, you'll need to include the following dependency:

[source,xml,subs=attributes+]
----
<dependency>
<groupId>io.quarkiverse.langchain4j</groupId>
<artifactId>quarkus-langchain4j-pgvector</artifactId>
<version>{project-version}</version>
</dependency>
----

This extension checks for a default datasource, so make sure you have defined at least one datasource. For detailed guidance, refer to link:https://quarkus.io/guides/datasource[CONFIGURE DATA SOURCES IN QUARKUS].

IMPORTANT: If you plan to use `devservices`, be sure to set this property: `quarkus.datasource.devservices.image-name=ankane/pgvector:v0.5.1`.

IMPORTANT: The pgvector store requires the dimension of the vector to be set. Add the `quarkus.langchain4j.pgvector.dimension` property to your `application.properties` file and set it to the dimension of the vector. The dimension depends on the embedding model you use.
For example, `AllMiniLmL6V2QuantizedEmbeddingModel` produces vectors of dimension 384. OpenAI’s `text-embedding-ada-002` produces vectors of dimension 1536.

Upon installing the extension, you can utilize the pgvector store using the following code:

[source,java]
----
include::{examples-dir}/io/quarkiverse/langchain4j/samples/IngestorExampleWithPgvector.java[]
----

== Configuration Settings

Customize the behavior of the extension by exploring various configuration options:

include::includes/quarkus-langchain4j-pgvector.adoc[leveloffset=+1,opts=optional]

== Under the Hood

Each ingested document is saved as a row in a Postgres table, containing the embedding column stored as a vector.
14 changes: 0 additions & 14 deletions docs/modules/ROOT/pages/pinecone-store.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,6 @@ To make use of the Pinecone document store, you'll need to include the following
</dependency>
----

The required configuration properties to make the extension work are
`quarkus.langchain4j.pinecone.api-key`,
`quarkus.langchain4j.pinecone.environment`,
`quarkus.langchain4j.pinecone.index-name`, and
`quarkus.langchain4j.pinecone.project-id`. The specified index will be
created if it doesn't exist yet.

Upon installing the extension, you can utilize the Pinecone embedding store using the following code:

[source,java]
----
include::{examples-dir}/io/quarkiverse/langchain4j/samples/IngestorExampleWithPinecone.java[]
----

== Configuration Settings

Customize the behavior of the extension by exploring various configuration options:
Expand Down
6 changes: 6 additions & 0 deletions docs/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@
<artifactId>quarkus-langchain4j-pinecone-deployment</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>io.quarkiverse.langchain4j</groupId>
<artifactId>quarkus-langchain4j-pgvector-deployment</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>io.quarkiverse.langchain4j</groupId>
<artifactId>quarkus-langchain4j-hugging-face-deployment</artifactId>
Expand Down Expand Up @@ -121,6 +126,7 @@
<include>quarkus-langchain4j-redis.adoc</include>
<include>quarkus-langchain4j-chroma.adoc</include>
<include>quarkus-langchain4j-pinecone.adoc</include>
<include>quarkus-langchain4j-pgvector.adoc</include>
<filtering>false</filtering>
</resource>
<resource>
Expand Down
79 changes: 79 additions & 0 deletions pgvector/deployment/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>io.quarkiverse.langchain4j</groupId>
<artifactId>quarkus-langchain4j-pgvector-parent</artifactId>
<version>999-SNAPSHOT</version>
</parent>
<artifactId>quarkus-langchain4j-pgvector-deployment</artifactId>
<name>Quarkus langchain4j-pgvector - Deployment</name>
<dependencies>
<dependency>
<groupId>io.quarkiverse.langchain4j</groupId>
<artifactId>quarkus-langchain4j-pgvector</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>io.quarkus</groupId>
<artifactId>quarkus-arc-deployment</artifactId>
</dependency>
<dependency>
<groupId>io.quarkus</groupId>
<artifactId>quarkus-jackson-deployment</artifactId>
</dependency>
<dependency>
<groupId>io.quarkus</groupId>
<artifactId>quarkus-agroal-deployment</artifactId>
</dependency>
<dependency>
<groupId>io.quarkus</groupId>
<artifactId>quarkus-jdbc-postgresql-deployment</artifactId>
</dependency>
<dependency>
<groupId>io.quarkiverse.langchain4j</groupId>
<artifactId>quarkus-langchain4j-core-deployment</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>io.quarkus</groupId>
<artifactId>quarkus-junit5-internal</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<version>${assertj.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.wiremock</groupId>
<artifactId>wiremock-standalone</artifactId>
<version>${wiremock.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-embeddings-all-minilm-l6-v2-q</artifactId>
<version>${langchain4j.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<annotationProcessorPaths>
<path>
<groupId>io.quarkus</groupId>
<artifactId>quarkus-extension-processor</artifactId>
<version>${quarkus.version}</version>
</path>
</annotationProcessorPaths>
</configuration>
</plugin>
</plugins>
</build>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package io.quarkiverse.langchain4j.pgvector.deployment;

import jakarta.enterprise.context.ApplicationScoped;

import org.jboss.jandex.ClassType;
import org.jboss.jandex.DotName;
import org.jboss.jandex.ParameterizedType;

import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.store.embedding.EmbeddingStore;
import io.agroal.api.AgroalDataSource;
import io.quarkiverse.langchain4j.pgvector.PgVectorEmbeddingStore;
import io.quarkiverse.langchain4j.pgvector.runtime.PgVectorEmbeddingStoreConfig;
import io.quarkiverse.langchain4j.pgvector.runtime.PgVectorEmbeddingStoreRecorder;
import io.quarkus.arc.deployment.SyntheticBeanBuildItem;
import io.quarkus.deployment.annotations.BuildProducer;
import io.quarkus.deployment.annotations.BuildStep;
import io.quarkus.deployment.annotations.ExecutionTime;
import io.quarkus.deployment.annotations.Record;
import io.quarkus.deployment.builditem.FeatureBuildItem;

/**
 * Build-time processor of the pgvector extension. It registers the extension feature and a
 * synthetic bean for {@link PgVectorEmbeddingStore}.
 */
class Langchain4jPgvectorProcessor {

    private static final String FEATURE = "langchain4j-pgvector";

    public static final DotName PGVECTOR_EMBEDDING_STORE = DotName.createSimple(PgVectorEmbeddingStore.class);

    /**
     * Registers the feature name of this extension.
     *
     * @return the feature build item carrying {@code langchain4j-pgvector}
     */
    @BuildStep
    FeatureBuildItem feature() {
        return new FeatureBuildItem(FEATURE);
    }

    /**
     * Produces the synthetic bean for the pgvector embedding store. The bean is an
     * application-scoped default bean, initialized at runtime, that declares an injection point
     * on {@link AgroalDataSource} and is created by the function obtained from the recorder.
     *
     * @param beanProducer producer receiving the synthetic bean definition
     * @param recorder recorder supplying the function that creates the store
     * @param config the pgvector embedding store configuration handed to the recorder
     */
    @BuildStep
    @Record(ExecutionTime.RUNTIME_INIT)
    public void createBean(
            BuildProducer<SyntheticBeanBuildItem> beanProducer,
            PgVectorEmbeddingStoreRecorder recorder,
            PgVectorEmbeddingStoreConfig config) {
        // Bean types: the raw EmbeddingStore and EmbeddingStore<TextSegment>.
        ClassType rawStoreType = ClassType.create(EmbeddingStore.class);
        ParameterizedType textSegmentStoreType = ParameterizedType.create(EmbeddingStore.class,
                ClassType.create(TextSegment.class));
        // Type of the datasource declared as an injection point of the bean.
        ClassType dataSourceType = ClassType.create(DotName.createSimple(AgroalDataSource.class));

        SyntheticBeanBuildItem storeBean = SyntheticBeanBuildItem
                .configure(PGVECTOR_EMBEDDING_STORE)
                .types(rawStoreType, textSegmentStoreType)
                .setRuntimeInit()
                .defaultBean()
                .scope(ApplicationScoped.class)
                .addInjectionPoint(dataSourceType)
                .createWith(recorder.embeddingStoreFunction(config))
                .done();

        beanProducer.produce(storeBean);
    }
}
Loading

0 comments on commit 2deb50b

Please sign in to comment.