From 83197d11f480baf532e2c86c8f86ff87d452a7a3 Mon Sep 17 00:00:00 2001 From: Cyril Rohr Date: Wed, 6 Sep 2023 08:58:50 +0000 Subject: [PATCH] Store embedding in SQLite to avoid recomputation unless title or description changed --- .gitignore | 1 + Gemfile.lock | 2 +- Procfile.dev | 2 +- app/models/ai.rb | 13 +++ app/models/talk.rb | 97 +++++++++++++------ config/initializers/openai.rb | 2 +- .../20230906073343_add_embedding_to_talks.rb | 5 + db/schema.rb | 3 +- 8 files changed, 90 insertions(+), 35 deletions(-) create mode 100644 app/models/ai.rb create mode 100644 db/migrate/20230906073343_add_embedding_to_talks.rb diff --git a/.gitignore b/.gitignore index 9cf06003..d29f57da 100644 --- a/.gitignore +++ b/.gitignore @@ -46,6 +46,7 @@ *.local /data.ms +/data_meili /data_tmp /data_preparation/**/* !/data_preparation/organisations.yml diff --git a/Gemfile.lock b/Gemfile.lock index a9d37cf4..a12b7046 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -203,7 +203,7 @@ GEM meilisearch (~> 0.23.0) meta-tags (2.18.0) actionpack (>= 3.2.0, < 7.1) - mini_mime (1.1.2) + mini_mime (1.1.5) minitest (5.18.0) msgpack (1.7.0) multi_xml (0.6.0) diff --git a/Procfile.dev b/Procfile.dev index cd1f7eff..806c4806 100644 --- a/Procfile.dev +++ b/Procfile.dev @@ -1,3 +1,3 @@ web: bin/rails server -p 3000 vite: bin/vite dev -search: docker rm -f rubyvideo-meilisearch && docker run --name rubyvideo-meilisearch -p 7700:7700 -v $(pwd)/meili_data:/data.ms getmeili/meilisearch:v1.3.2 meilisearch --env development --no-analytics --log-level=INFO \ No newline at end of file +search: docker rm -f rubyvideo-meilisearch && docker run --name rubyvideo-meilisearch -p 7700:7700 -v $(pwd)/meili_data:/data_meili getmeili/meilisearch:v1.3.2 meilisearch --env development --no-analytics --log-level=INFO \ No newline at end of file diff --git a/app/models/ai.rb b/app/models/ai.rb new file mode 100644 index 00000000..f475270d --- /dev/null +++ b/app/models/ai.rb @@ -0,0 +1,13 @@ +class Ai + def self.embedding(*inputs) + return nil unless ENV["OPENAI_ACCESS_TOKEN"].present? + client = OpenAI::Client.new + response = client.embeddings( + parameters: { + model: "text-embedding-ada-002", + input: inputs.join("\n\n") + } + ) + response.dig("data", 0, "embedding") + end +end diff --git a/app/models/talk.rb b/app/models/talk.rb index 7042ff94..3f3d430b 100644 --- a/app/models/talk.rb +++ b/app/models/talk.rb @@ -4,25 +4,36 @@ # Table name: talks # # id :integer not null, primary key -# title :string default(""), not null +# date :date # description :text default(""), not null +# embedding :json +# like_count :integer # slug :string default(""), not null -# video_id :string default(""), not null -# video_provider :string default(""), not null -# thumbnail_sm :string default(""), not null -# thumbnail_md :string default(""), not null # thumbnail_lg :string default(""), not null +# thumbnail_md :string default(""), not null +# thumbnail_sm :string default(""), not null +# thumbnail_xl :string default(""), not null +# thumbnail_xs :string default(""), not null +# title :string default(""), not null +# video_provider :string default(""), not null +# view_count :integer # year :integer # created_at :datetime not null # updated_at :datetime not null # event_id :integer -# thumbnail_xs :string default(""), not null -# thumbnail_xl :string default(""), not null -# date :date -# like_count :integer -# view_count :integer +# video_id :string default(""), not null +# +# Indexes +# +# index_talks_on_date (date) +# index_talks_on_event_id (event_id) +# index_talks_on_slug (slug) +# index_talks_on_title (title) +# +# Foreign Keys +# +# event_id (event_id => events.id) # -# rubocop:enable Layout/LineLength class Talk < ApplicationRecord include Sluggable include Suggestable @@ -41,12 +52,17 @@ class Talk < ApplicationRecord # delegates delegate :name, to: :event, prefix: true, allow_nil: true + before_save :compute_embedding, if: :must_compute_embedding? + # search - meilisearch enqueue: true, raise_on_failure: Rails.env.development? do + meilisearch primary_key: :id, enqueue: true, raise_on_failure: Rails.env.development? do attribute :title attribute :description attribute :slug - attribute :video_id + # ⚠️ This `video_id` attribute makes indexing (silently) fail with v1.3.2 of meilisearch. Error message from meilisearch (GET /tasks): + # "The primary key inference failed as the engine found 2 fields ending with `id` in their names: 'id' and 'video_id'. Please specify the primary key manually using the `primaryKey` query parameter" + # Adding a custom primary_key: :id above didn't make any difference, so removing this attribute for now. + # attribute :video_id attribute :video_provider attribute :thumbnail_sm attribute :thumbnail_md @@ -54,6 +70,8 @@ class Talk < ApplicationRecord attribute :speakers do speakers.pluck(:name) end + # ⚠️ This must return nil and not an empty array if no vector is available. + # Otherwise all other indexing tasks with non-zero vector arrays will silently fail, since the engine will expect all vectors to have the same length. attribute :_vectors searchable_attributes [:title, :description] sortable_attributes [:title] @@ -62,10 +80,27 @@ class Talk < ApplicationRecord attributes_to_highlight ["*"] end - # https://github.com/meilisearch/meilisearch-rails#custom-attribute-definition - # this doesn't work yet, because _vectors is not seen as an attribute - def will_save_change_to__vectors? - will_save_change_to_title? || will_save_change_to_description? + # Recomputes embedding for all talks that don't have one yet. + def self.reembed!(sleep_interval: 2.seconds, limit: nil) + # required for querying vectors (not indexing) + MeiliSearch::Rails.client.http_patch "/experimental-features", {vectorStore: true} + + Talk.where(embedding: nil).limit(limit).in_batches(of: 10) do |talks| + talks.each do |talk| + talk.compute_embedding + talk.save! + end + # seems to help with not getting rate-limited by OpenAI + sleep sleep_interval + end + + if Talk.where(embedding: nil).exists? + Rails.logger.warn "Some talks are still missing their embedding. You should re-run the task" + false + else + Rails.logger.info "Good job, all talks have their embedding." + true + end end def to_meta_tags @@ -96,25 +131,25 @@ def thumbnail_xl end def neighbors(limit: 5) - query_vector = Talk.index.document(self.id).fetch("_vectors", []).first + current_talk = Talk.index.document(id) + query_vector = current_talk.fetch("_vectors", []) return Talk.none if query_vector.blank? - Talk.search("", vector: query_vector, limit: limit, filter: "id != #{self.id}") + Talk.search("", vector: query_vector, limit: limit, filter: "id != #{id}") + rescue MeiliSearch::ApiError => e + Rails.logger.error("MeiliSearch error: #{e.message}") + Talk.none end def _vectors - return nil unless ENV["OPENAI_ACCESS_TOKEN"].present? - # might need to split at some point if over the token limit (e.g. if including transcription) - @_vectors ||= [self.class.embedding(title, description)] + embedding + end + + def compute_embedding + Rails.logger.info "Computing embedding for talk #{id}" + self.embedding = Ai.embedding(title, description) end - def self.embedding(*inputs) - client = OpenAI::Client.new - response = client.embeddings( - parameters: { - model: "text-embedding-ada-002", - input: inputs.join("\n\n"), - }, - ) - response.dig("data", 0, "embedding") + private def must_compute_embedding? + embedding.nil? || will_save_change_to_title? || will_save_change_to_description? end end diff --git a/config/initializers/openai.rb b/config/initializers/openai.rb index 4865492b..789e64d7 100644 --- a/config/initializers/openai.rb +++ b/config/initializers/openai.rb @@ -2,4 +2,4 @@ OpenAI.configure do |config| config.access_token = ENV.fetch("OPENAI_ACCESS_TOKEN") end -end \ No newline at end of file +end diff --git a/db/migrate/20230906073343_add_embedding_to_talks.rb b/db/migrate/20230906073343_add_embedding_to_talks.rb new file mode 100644 index 00000000..bea1479f --- /dev/null +++ b/db/migrate/20230906073343_add_embedding_to_talks.rb @@ -0,0 +1,5 @@ +class AddEmbeddingToTalks < ActiveRecord::Migration[7.1] + def change + add_column :talks, :embedding, :jsonb, null: true + end +end diff --git a/db/schema.rb b/db/schema.rb index 57157f5e..04069374 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[7.1].define(version: 2023_07_20_151537) do +ActiveRecord::Schema[7.1].define(version: 2023_09_06_073343) do create_table "ahoy_events", force: :cascade do |t| t.integer "visit_id" t.integer "user_id" @@ -153,6 +153,7 @@ t.date "date" t.integer "like_count" t.integer "view_count" + t.json "embedding" t.index ["date"], name: "index_talks_on_date" t.index ["event_id"], name: "index_talks_on_event_id" t.index ["slug"], name: "index_talks_on_slug"