Skip to content

Commit

Permalink
Add metadata support for documents
Browse files Browse the repository at this point in the history
Add metadata column to pgvector storage to allow associating additional
information with stored documents. This enables tracking metadata like
source documents, page numbers, or any other document-specific information.

- Add JSONB metadata column to schema
- Update add_texts to support metadata parameter
- Update update_texts to support metadata parameter
- Add metadata support to similarity search results
- Add tests for metadata functionality

BREAKING CHANGE: Schema update requires existing tables to be recreated
or manually altered to add metadata column
  • Loading branch information
aellispierce committed Oct 28, 2024
1 parent 621226a commit a0d6595
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 10 deletions.
44 changes: 34 additions & 10 deletions lib/langchain/vectorsearch/pgvector.rb
Original file line number Diff line number Diff line change
Expand Up @@ -51,43 +51,65 @@ def documents_model
# Upsert a list of texts to the index
# @param texts [Array<String>] The texts to add to the index
# @param ids [Array<Integer>] The ids of the objects to add to the index, in the same order as the texts
# @param metadata [Array<Hash>] The metadata to associate with each text, in the same order as the texts
# @return [PG::Result] The response from the database including the ids of
# the added or updated texts.
def upsert_texts(texts:, ids:, metadata: nil)
  # Build one fresh Hash per text. Array.new(size, {}) would hand every slot
  # the very same Hash object.
  metadata ||= Array.new(texts.size) { {} }

  # zip pads with nil when ids/metadata are shorter than texts. A nil metadata
  # entry is stored as "{}" (the same value the schema uses as the column
  # default) rather than as JSON null.
  data = texts.zip(ids, metadata).map do |text, id, meta|
    {
      id: id,
      content: text,
      vectors: llm.embed(text: text).embedding.to_s,
      namespace: namespace,
      metadata: (meta || {}).to_json
    }
  end

  # On an id conflict the existing row is overwritten with the incoming
  # content, vectors and metadata. NOTE: when the caller omits metadata, the
  # incoming value is "{}", so metadata already stored for that id is reset.
  @db[table_name.to_sym]
    .insert_conflict(
      target: :id,
      update: {
        content: Sequel[:excluded][:content],
        vectors: Sequel[:excluded][:vectors],
        metadata: Sequel[:excluded][:metadata]
      }
    )
    .multi_insert(data, return: :primary_key)
end

# Add a list of texts to the index
# @param texts [Array<String>] The texts to add to the index
# @param ids [Array<String>] The ids to add to the index, in the same order as the texts
# @param metadata [Array<Hash>] The metadata to associate with each text, in the same order as the texts
# @return [Array<Integer>] The ids of the added texts.
def add_texts(texts:, ids: nil, metadata: nil)
  # Build one fresh Hash per text. Array.new(size, {}) would hand every slot
  # the very same Hash object.
  metadata ||= Array.new(texts.size) { {} }

  # Caller-supplied ids go through the upsert path so that existing rows with
  # the same id are overwritten instead of raising a conflict.
  return upsert_texts(texts: texts, ids: ids, metadata: metadata) if ids && !ids.empty?

  # No ids given: plain insert, the database assigns the primary keys.
  # zip pads with nil when metadata is shorter than texts; a nil entry is
  # stored as "{}" rather than as JSON null.
  data = texts.zip(metadata).map do |text, meta|
    {
      content: text,
      vectors: llm.embed(text: text).embedding.to_s,
      namespace: namespace,
      metadata: (meta || {}).to_json
    }
  end

  @db[table_name.to_sym].multi_insert(data, return: :primary_key)
end

# Update a list of ids and corresponding texts to the index
# @param texts [Array<String>] The texts to add to the index
# @param ids [Array<String>] The ids to add to the index, in the same order as the texts
# @param metadata [Array<Hash>] The metadata to associate with each text, in the same order as the texts
# @return [Array<Integer>] The ids of the updated texts.
def update_texts(texts:, ids:, metadata: nil)
  # Updating is an upsert keyed on id; metadata is forwarded unchanged.
  # NOTE: upsert_texts substitutes empty metadata when none is given, so
  # calling this without metadata resets the metadata stored for those ids.
  upsert_texts(texts: texts, ids: ids, metadata: metadata)
end

# Remove a list of texts from the index
Expand All @@ -107,6 +129,7 @@ def create_default_schema
text :content
column :vectors, "vector(#{vector_dimensions})"
text namespace_column.to_sym, default: nil
jsonb :metadata, default: "{}"
end
end

Expand Down Expand Up @@ -136,6 +159,7 @@ def similarity_search(query:, k: 4)
def similarity_search_by_vector(embedding:, k: 4)
db.transaction do # BEGIN
documents_model
.select(:content, :metadata)
.nearest_neighbors(:vectors, embedding, distance: operator).limit(k)
.where(namespace_column.to_sym => namespace)
end
Expand Down
63 changes: 63 additions & 0 deletions spec/langchain/vectorsearch/pgvector_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,23 @@
result = subject.add_texts(texts: ["Hello World", "Hello World"])
expect(result.size).to eq(2)
end

it "adds texts with metadata" do
  metadata = [
    {"source" => "doc1", "page" => 1},
    {"source" => "doc2", "page" => 2}
  ]
  result = subject.add_texts(
    texts: ["Hello World", "Hello World"],
    metadata: metadata
  )

  expect(result.size).to eq(2)

  # Without ORDER BY the database may return the rows in any order, which
  # would make the positional assertions below flaky. Ordering by id lines the
  # rows up with the insertion order of the two texts.
  stored_records = client.exec_params(
    "SELECT metadata FROM products WHERE id IN ($1, $2) ORDER BY id",
    [result[0], result[1]]
  )
  expect(JSON.parse(stored_records[0]["metadata"])).to match(metadata[0])
  expect(JSON.parse(stored_records[1]["metadata"])).to match(metadata[1])
end
end

describe "#update_texts" do
Expand Down Expand Up @@ -96,6 +113,36 @@
count = client.exec_params(count_query)
expect(count[0]["count"].to_i).to eq(2)
end

it "updates texts and metadata" do
  initial_metadata = [
    {"source" => "doc1", "page" => 1},
    {"source" => "doc2", "page" => 2}
  ]

  values = subject.add_texts(
    texts: ["Hello World", "Hello World"],
    metadata: initial_metadata
  )

  updated_metadata = [
    {"source" => "doc1", "page" => 1, "updated" => true},
    {"source" => "doc2", "page" => 3}
  ]

  ids = values.flatten
  result = subject.update_texts(
    texts: ["Hello World", "Hello World".reverse],
    ids: ids,
    metadata: updated_metadata
  )

  expect(result.size).to eq(2)

  # Without ORDER BY the database may return the rows in any order, which
  # would make the positional assertions below flaky. The update keeps the
  # ids, so ordering by id lines the rows up with updated_metadata.
  stored_records = client.exec_params(
    "SELECT metadata FROM products WHERE id IN ($1, $2) ORDER BY id",
    [ids[0], ids[1]]
  )
  expect(JSON.parse(stored_records[0]["metadata"])).to match(updated_metadata[0])
  expect(JSON.parse(stored_records[1]["metadata"])).to match(updated_metadata[1])
end
end

describe "#remove_texts" do
Expand Down Expand Up @@ -170,6 +217,22 @@
result = subject.similarity_search(query: "earth")
expect(result.first.content).to eq("a namespaced chunk of text")
end

it "searches for similar texts with metadata and namespace" do
  scoped_namespace = "foo_namespace"
  zero_vector = Array.new(1536, 0)
  serialized_metadata = {source: "earth_doc", page: 1}.to_json

  # Seed a single document that lives inside the namespace under test.
  document = subject.documents_model.new(
    content: "a namespaced chunk of text",
    vectors: zero_vector,
    namespace: scoped_namespace,
    metadata: serialized_metadata
  )
  document.save

  # Scope the search to the namespace the document was stored under.
  allow(subject).to receive(:namespace).and_return(scoped_namespace)
  top_hit = subject.similarity_search(query: "earth").first

  expect(top_hit.content).to eq("a namespaced chunk of text")
  expect(JSON.parse(top_hit.metadata)).to match({"source" => "earth_doc", "page" => 1})
end
end

describe "#similarity_search_by_vector" do
Expand Down

0 comments on commit a0d6595

Please sign in to comment.