Add metadata support for documents

Add a metadata column to pgvector storage to allow associating additional information with stored documents. This enables tracking metadata such as source documents, page numbers, or any other document-specific information.

- Add JSONB metadata column to schema
- Update add_texts to support metadata parameter
- Update update_texts to support metadata parameter
- Add metadata support to similarity search results
- Add tests for metadata functionality

BREAKING CHANGE: Schema update requires existing tables to be recreated or manually altered to add the metadata column.
patterns-ai-core · Oct 28, 2024 · a0d6595 · a0d6595
1 parent 621226a
commit a0d6595
Show file tree

Hide file tree

Showing 2 changed files with 97 additions and 10 deletions.
diff --git a/lib/langchain/vectorsearch/pgvector.rb b/lib/langchain/vectorsearch/pgvector.rb
@@ -51,43 +51,65 @@ def documents_model
     # Upsert a list of texts to the index
     # @param texts [Array<String>] The texts to add to the index
     # @param ids [Array<Integer>] The ids of the objects to add to the index, in the same order as the texts
+    # @param metadata [Array<Hash>] The metadata to associate with each text, in the same order as the texts
     # @return [PG::Result] The response from the database including the ids of
     # the added or updated texts.
-    def upsert_texts(texts:, ids:)
-      data = texts.zip(ids).flat_map do |(text, id)|
-        {id: id, content: text, vectors: llm.embed(text: text).embedding.to_s, namespace: namespace}
+    def upsert_texts(texts:, ids:, metadata: nil)
+      metadata = Array.new(texts.size, {}) if metadata.nil?
+
+      data = texts.zip(ids, metadata).flat_map do |text, id, meta|
+        {
+          id: id,
+          content: text,
+          vectors: llm.embed(text: text).embedding.to_s,
+          namespace: namespace,
+          metadata: meta.to_json
+        }
       end
       # @db[table_name.to_sym].multi_insert(data, return: :primary_key)
       @db[table_name.to_sym]
         .insert_conflict(
           target: :id,
-          update: {content: Sequel[:excluded][:content], vectors: Sequel[:excluded][:vectors]}
+          update: {
+            content: Sequel[:excluded][:content],
+            vectors: Sequel[:excluded][:vectors],
+            metadata: Sequel[:excluded][:metadata]
+          }
         )
         .multi_insert(data, return: :primary_key)
     end
 
     # Add a list of texts to the index
     # @param texts [Array<String>] The texts to add to the index
     # @param ids [Array<String>] The ids to add to the index, in the same order as the texts
+    # @param metadata [Array<Hash>] The metadata to associate with each text, in the same order as the texts
     # @return [Array<Integer>] The the ids of the added texts.
-    def add_texts(texts:, ids: nil)
+    def add_texts(texts:, ids: nil, metadata: nil)
+      metadata = Array.new(texts.size, {}) if metadata.nil?
+
       if ids.nil? || ids.empty?
-        data = texts.map do |text|
-          {content: text, vectors: llm.embed(text: text).embedding.to_s, namespace: namespace}
+        data = texts.zip(metadata).map do |text, meta|
+          {
+            content: text,
+            vectors: llm.embed(text: text).embedding.to_s,
+            namespace: namespace,
+            metadata: meta.to_json
+          }
         end
 
         @db[table_name.to_sym].multi_insert(data, return: :primary_key)
       else
-        upsert_texts(texts: texts, ids: ids)
+        upsert_texts(texts: texts, ids: ids, metadata: metadata)
       end
     end
 
     # Update a list of ids and corresponding texts to the index
     # @param texts [Array<String>] The texts to add to the index
     # @param ids [Array<String>] The ids to add to the index, in the same order as the texts
+    # @param metadata [Array<Hash>] The metadata to associate with each text, in the same order as the texts
     # @return [Array<Integer>] The ids of the updated texts.
-    def update_texts(texts:, ids:)
-      upsert_texts(texts: texts, ids: ids)
+    def update_texts(texts:, ids:, metadata: nil)
+      upsert_texts(texts: texts, ids: ids, metadata: metadata)
     end
 
     # Remove a list of texts from the index
@@ -107,6 +129,7 @@ def create_default_schema
         text :content
         column :vectors, "vector(#{vector_dimensions})"
         text namespace_column.to_sym, default: nil
+        jsonb :metadata, default: "{}"
       end
     end
 
@@ -136,6 +159,7 @@ def similarity_search(query:, k: 4)
     def similarity_search_by_vector(embedding:, k: 4)
       db.transaction do # BEGIN
         documents_model
+          .select(:content, :metadata)
           .nearest_neighbors(:vectors, embedding, distance: operator).limit(k)
           .where(namespace_column.to_sym => namespace)
       end

diff --git a/spec/langchain/vectorsearch/pgvector_spec.rb b/spec/langchain/vectorsearch/pgvector_spec.rb
@@ -45,6 +45,23 @@
         result = subject.add_texts(texts: ["Hello World", "Hello World"])
         expect(result.size).to eq(2)
       end
+
+      it "adds texts with metadata" do
+        metadata = [
+          {"source" => "doc1", "page" => 1},
+          {"source" => "doc2", "page" => 2}
+        ]
+        result = subject.add_texts(
+          texts: ["Hello World", "Hello World"],
+          metadata: metadata
+        )
+
+        expect(result.size).to eq(2)
+
+        stored_records = client.exec_params("SELECT metadata FROM products WHERE id IN ($1, $2)", [result[0], result[1]])
+        expect(JSON.parse(stored_records[0]["metadata"])).to match(metadata[0])
+        expect(JSON.parse(stored_records[1]["metadata"])).to match(metadata[1])
+      end
     end
 
     describe "#update_texts" do
@@ -96,6 +113,36 @@
         count = client.exec_params(count_query)
         expect(count[0]["count"].to_i).to eq(2)
       end
+
+      it "updates texts and metadata" do
+        initial_metadata = [
+          {"source" => "doc1", "page" => 1},
+          {"source" => "doc2", "page" => 2}
+        ]
+
+        values = subject.add_texts(
+          texts: ["Hello World", "Hello World"],
+          metadata: initial_metadata
+        )
+
+        updated_metadata = [
+          {"source" => "doc1", "page" => 1, "updated" => true},
+          {"source" => "doc2", "page" => 3}
+        ]
+
+        ids = values.flatten
+        result = subject.update_texts(
+          texts: ["Hello World", "Hello World".reverse],
+          ids: ids,
+          metadata: updated_metadata
+        )
+
+        expect(result.size).to eq(2)
+
+        stored_records = client.exec_params("SELECT metadata FROM products WHERE id IN ($1, $2)", [ids[0], ids[1]])
+        expect(JSON.parse(stored_records[0]["metadata"])).to match(updated_metadata[0])
+        expect(JSON.parse(stored_records[1]["metadata"])).to match(updated_metadata[1])
+      end
     end
 
     describe "#remove_texts" do
@@ -170,6 +217,22 @@
         result = subject.similarity_search(query: "earth")
         expect(result.first.content).to eq("a namespaced chunk of text")
       end
+
+      it "searches for similar texts with metadata and namespace" do
+        namespace = "foo_namespace"
+
+        subject.documents_model.new(
+          content: "a namespaced chunk of text",
+          vectors: 1536.times.map { 0 },
+          namespace: namespace,
+          metadata: {source: "earth_doc", page: 1}.to_json
+        ).save
+
+        allow(subject).to receive(:namespace).and_return(namespace)
+        result = subject.similarity_search(query: "earth")
+        expect(result.first.content).to eq("a namespaced chunk of text")
+        expect(JSON.parse(result.first.metadata)).to match({"source" => "earth_doc", "page" => 1})
+      end
     end
 
     describe "#similarity_search_by_vector" do