Skip to content

Commit

Permalink
Store embedding in SQLite to avoid recomputation unless title or desc…
Browse files Browse the repository at this point in the history
…ription changed
  • Loading branch information
crohr committed Sep 6, 2023
1 parent cde60c2 commit 83197d1
Show file tree
Hide file tree
Showing 8 changed files with 90 additions and 35 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
*.local

/data.ms
/data_meili
/data_tmp
/data_preparation/**/*
!/data_preparation/organisations.yml
Expand Down
2 changes: 1 addition & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ GEM
meilisearch (~> 0.23.0)
meta-tags (2.18.0)
actionpack (>= 3.2.0, < 7.1)
mini_mime (1.1.2)
mini_mime (1.1.5)
minitest (5.18.0)
msgpack (1.7.0)
multi_xml (0.6.0)
Expand Down
2 changes: 1 addition & 1 deletion Procfile.dev
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
web: bin/rails server -p 3000
vite: bin/vite dev
search: docker rm -f rubyvideo-meilisearch && docker run --name rubyvideo-meilisearch -p 7700:7700 -v $(pwd)/meili_data:/data.ms getmeili/meilisearch:v1.3.2 meilisearch --env development --no-analytics --log-level=INFO
search: docker rm -f rubyvideo-meilisearch && docker run --name rubyvideo-meilisearch -p 7700:7700 -v $(pwd)/meili_data:/data_meili getmeili/meilisearch:v1.3.2 meilisearch --env development --no-analytics --log-level=INFO
13 changes: 13 additions & 0 deletions app/models/ai.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Small wrapper around the OpenAI embeddings endpoint.
class Ai
  class << self
    # Requests one embedding for the given text fragments.
    #
    # The fragments are joined with a blank line and sent as a single input
    # to the "text-embedding-ada-002" model.
    #
    # @param inputs [Array<String>] text fragments (e.g. a title and a description)
    # @return [Object, nil] whatever sits at data[0]["embedding"] in the
    #   response, or nil when OPENAI_ACCESS_TOKEN is not set (no request is made)
    def embedding(*inputs)
      return unless ENV["OPENAI_ACCESS_TOKEN"].present?

      text = inputs.join("\n\n")
      request = {model: "text-embedding-ada-002", input: text}
      result = OpenAI::Client.new.embeddings(parameters: request)
      result.dig("data", 0, "embedding")
    end
  end
end
97 changes: 66 additions & 31 deletions app/models/talk.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,36 @@
# Table name: talks
#
# id :integer not null, primary key
# title :string default(""), not null
# date :date
# description :text default(""), not null
# embedding :json
# like_count :integer
# slug :string default(""), not null
# video_id :string default(""), not null
# video_provider :string default(""), not null
# thumbnail_sm :string default(""), not null
# thumbnail_md :string default(""), not null
# thumbnail_lg :string default(""), not null
# thumbnail_md :string default(""), not null
# thumbnail_sm :string default(""), not null
# thumbnail_xl :string default(""), not null
# thumbnail_xs :string default(""), not null
# title :string default(""), not null
# video_provider :string default(""), not null
# view_count :integer
# year :integer
# created_at :datetime not null
# updated_at :datetime not null
# event_id :integer
# thumbnail_xs :string default(""), not null
# thumbnail_xl :string default(""), not null
# date :date
# like_count :integer
# view_count :integer
# video_id :string default(""), not null
#
# Indexes
#
# index_talks_on_date (date)
# index_talks_on_event_id (event_id)
# index_talks_on_slug (slug)
# index_talks_on_title (title)
#
# Foreign Keys
#
# event_id (event_id => events.id)
#
# rubocop:enable Layout/LineLength
class Talk < ApplicationRecord
include Sluggable
include Suggestable
Expand All @@ -41,19 +52,26 @@ class Talk < ApplicationRecord
# delegates
delegate :name, to: :event, prefix: true, allow_nil: true

before_save :compute_embedding, if: :must_compute_embedding?

# search
meilisearch enqueue: true, raise_on_failure: Rails.env.development? do
meilisearch primary_key: :id, enqueue: true, raise_on_failure: Rails.env.development? do
attribute :title
attribute :description
attribute :slug
attribute :video_id
# ⚠️ This `video_id` attribute makes indexing (silently) fail with v1.3.2 of meilisearch. Error message from meilisearch (GET /tasks):
# "The primary key inference failed as the engine found 2 fields ending with `id` in their names: 'id' and 'video_id'. Please specify the primary key manually using the `primaryKey` query parameter"
# Adding a custom primary_key: :id above didn't make any difference, so removing this attribute for now.
# attribute :video_id
attribute :video_provider
attribute :thumbnail_sm
attribute :thumbnail_md
attribute :thumbnail_lg
attribute :speakers do
speakers.pluck(:name)
end
# ⚠️ This must return nil and not an empty array if no vector is available.
# Otherwise all other indexing tasks with non-zero vector arrays will silently fail, since the engine will expect all vectors to have the same length.
attribute :_vectors
searchable_attributes [:title, :description]
sortable_attributes [:title]
Expand All @@ -62,10 +80,27 @@ class Talk < ApplicationRecord
attributes_to_highlight ["*"]
end

# https://github.com/meilisearch/meilisearch-rails#custom-attribute-definition
# this doesn't work yet, because _vectors is not seen as an attribute
def will_save_change_to__vectors?
will_save_change_to_title? || will_save_change_to_description?
# Computes and saves an embedding for every talk whose `embedding` is nil.
#
# Talks are processed in batches of 10, pausing after each batch.
#
# @param sleep_interval [Numeric, ActiveSupport::Duration] pause after each batch
# @param limit [Integer, nil] maximum number of talks to process (nil = no limit)
# @return [Boolean] true when no talk is left without an embedding, false otherwise
def self.reembed!(sleep_interval: 2.seconds, limit: nil)
  # The vector store flag is required for querying vectors (not for indexing).
  MeiliSearch::Rails.client.http_patch("/experimental-features", {vectorStore: true})

  Talk.where(embedding: nil).limit(limit).in_batches(of: 10) do |batch|
    batch.each do |talk|
      talk.compute_embedding
      talk.save!
    end
    # Pausing between batches seems to help with not getting rate-limited by OpenAI.
    sleep(sleep_interval)
  end

  still_missing = Talk.where(embedding: nil).exists?
  unless still_missing
    Rails.logger.info("Good job, all talks have their embedding.")
    return true
  end

  Rails.logger.warn("Some talks are still missing their embedding. You should re-run the task")
  false
end

def to_meta_tags
Expand Down Expand Up @@ -96,25 +131,25 @@ def thumbnail_xl
end

# Searches for the talks whose vectors are closest to this talk's vector.
#
# Reads this talk's document from the Meilisearch index exactly once, takes
# its `_vectors` field and runs a single vector search that filters out this
# talk's own id.
#
# @param limit [Integer] maximum number of neighbours to return
# @return the result of `Talk.search`, or `Talk.none` when the indexed
#   document has no vector or Meilisearch raises an ApiError
def neighbors(limit: 5)
  current_talk = Talk.index.document(id)
  # `fetch` only falls back to [] when the key is absent; a key stored with a
  # nil value comes back as nil, which `blank?` below also covers.
  query_vector = current_talk.fetch("_vectors", [])
  return Talk.none if query_vector.blank?

  Talk.search("", vector: query_vector, limit: limit, filter: "id != #{id}")
rescue MeiliSearch::ApiError => e
  Rails.logger.error("MeiliSearch error: #{e.message}")
  Talk.none
end

# Value exposed to Meilisearch as the `_vectors` attribute.
#
# Returns the embedding stored on the record; it does not call the embedding
# API itself.
#
# @return [Object, nil] the stored `embedding`, or nil when
#   OPENAI_ACCESS_TOKEN is not set
def _vectors
  return nil unless ENV["OPENAI_ACCESS_TOKEN"].present?

  embedding
end

# Asks Ai.embedding for a vector built from the title and description and
# assigns it to `embedding`. Does not save the record.
#
# @return the value assigned to `embedding` (whatever Ai.embedding returned)
def compute_embedding
  Rails.logger.info("Computing embedding for talk #{id}")
  vector = Ai.embedding(title, description)
  self.embedding = vector
end

def self.embedding(*inputs)
client = OpenAI::Client.new
response = client.embeddings(
parameters: {
model: "text-embedding-ada-002",
input: inputs.join("\n\n"),
},
)
response.dig("data", 0, "embedding")
# Tells whether the embedding has to be (re)computed: either none is stored
# yet, or the title or description is about to change with this save.
private def must_compute_embedding?
  return true if embedding.nil?

  will_save_change_to_title? || will_save_change_to_description?
end
end
2 changes: 1 addition & 1 deletion config/initializers/openai.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
OpenAI.configure do |config|
config.access_token = ENV.fetch("OPENAI_ACCESS_TOKEN")
end
end
end
5 changes: 5 additions & 0 deletions db/migrate/20230906073343_add_embedding_to_talks.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Adds the `embedding` column to the `talks` table.
class AddEmbeddingToTalks < ActiveRecord::Migration[7.1]
def change
# Nullable, so existing rows simply start out without an embedding.
# NOTE(review): declared as :jsonb here while the Talk schema annotation lists
# the column as :json - confirm this is the intended type for the database in use.
add_column :talks, :embedding, :jsonb, null: true
end
end
3 changes: 2 additions & 1 deletion db/schema.rb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 83197d1

Please sign in to comment.