From ce18b090a17f793281248f3dd2936683eb7ba6a3 Mon Sep 17 00:00:00 2001
From: Ricky Ng-Adam
Date: Wed, 20 Mar 2024 20:44:58 +0000
Subject: [PATCH] issue #38: fix search implementation to drop use of documents view

---
 ...-20-html_content_to_chunk_chunk_id_idx.sql |   1 +
 ...024-03-20-md5hash-to-uuid-with-indexes.sql |  11 ++
 sql/2024-03-20-weighted-search-updated.sql    | 130 ++++++++++++++++++
 3 files changed, 142 insertions(+)
 create mode 100644 sql/2024-03-20-html_content_to_chunk_chunk_id_idx.sql
 create mode 100644 sql/2024-03-20-md5hash-to-uuid-with-indexes.sql
 create mode 100644 sql/2024-03-20-weighted-search-updated.sql

diff --git a/sql/2024-03-20-html_content_to_chunk_chunk_id_idx.sql b/sql/2024-03-20-html_content_to_chunk_chunk_id_idx.sql
new file mode 100644
index 0000000..86a6d8c
--- /dev/null
+++ b/sql/2024-03-20-html_content_to_chunk_chunk_id_idx.sql
@@ -0,0 +1 @@
+create index html_content_to_chunk_chunk_id_idx on html_content_to_chunk(chunk_id);
\ No newline at end of file
diff --git a/sql/2024-03-20-md5hash-to-uuid-with-indexes.sql b/sql/2024-03-20-md5hash-to-uuid-with-indexes.sql
new file mode 100644
index 0000000..74a5149
--- /dev/null
+++ b/sql/2024-03-20-md5hash-to-uuid-with-indexes.sql
@@ -0,0 +1,11 @@
+ALTER TABLE html_content ADD md5hash_uuid uuid;
+ALTER TABLE html_content_to_chunk ADD md5hash_uuid uuid;
+ALTER TABLE crawl ADD md5hash_uuid uuid;
+
+update html_content set md5hash_uuid = md5hash::uuid;
+update html_content_to_chunk set md5hash_uuid = md5hash::uuid;
+update crawl set md5hash_uuid = md5hash::uuid;
+
+create index html_content_to_chunk_md5hash_uuid_idx on html_content_to_chunk(md5hash_uuid);
+create index html_content_md5hash_uuid_idx on html_content(md5hash_uuid);
+create index crawl_md5hash_uuid_idx on crawl(md5hash_uuid);
diff --git a/sql/2024-03-20-weighted-search-updated.sql b/sql/2024-03-20-weighted-search-updated.sql
new file mode 100644
index 0000000..837d800
--- /dev/null
+++ b/sql/2024-03-20-weighted-search-updated.sql
@@ -0,0 +1,130 @@
+DROP FUNCTION if exists search(text,vector,double precision,integer,jsonb);
+CREATE OR REPLACE FUNCTION search(
+    query text,
+    query_embedding vector,
+    match_threshold double precision,
+    match_count integer,
+    weights_parameter JSONB
+)
+RETURNS JSONB
+AS
+$BODY$
+DECLARE
+    similarity_search boolean;
+    query_id uuid;
+    query_result JSONB;
+BEGIN
+
+    create temp table weights (
+        score_type score_type,
+        weight double precision
+    ) on commit drop;
+
+    INSERT INTO weights
+    SELECT key::score_type, value::float
+    FROM jsonb_each_text(weights_parameter)
+    WHERE value::float > 0.0;
+
+    -- check if there is a query embedding; if not, skip similarity search
+    if query_embedding is null then
+        similarity_search := false;
+    else
+        similarity_search := true;
+        insert into query(query, embedding) values (query, query_embedding)
+        on conflict do nothing
+        returning id, result into query_id, query_result;
+
+        if query_result is not null then
+            return query_result;
+        end if;
+    end if;
+
+    if similarity_search then
+        create temp table selected_chunks on commit drop as
+        select
+            ada_002.id as embedding_id,
+            chunk.id as chunk_id,
+            crawl.id as crawl_id,
+            score,
+            score_type,
+            ada_002.embedding <=> query_embedding as closeness
+        from ada_002
+        inner join "token" on ada_002.token_id = "token".id
+        inner join "chunk" on "token".chunk_id = chunk.id
+        inner join html_content_to_chunk hctc on hctc.chunk_id = chunk.id
+        inner join html_content hc on hctc.md5hash_uuid = hc.md5hash_uuid
+        inner join crawl on hc.md5hash_uuid = crawl.md5hash_uuid
+        inner join "score" on crawl.id = score.entity_id
+        order by ada_002.embedding <=> query_embedding
+        limit match_count*10;
+
+
+        create temp table similarity on commit drop as
+        with chunk_closest_by_id as (
+            select c.crawl_id, min(c.closeness) as closeness
+            from selected_chunks c
+            group by c.crawl_id
+        )
+        select c.crawl_id as crawl_id, c.chunk_id as chunk_id, (1-c.closeness) as score
+        from selected_chunks c
+        inner join chunk_closest_by_id cc on c.crawl_id = cc.crawl_id and c.closeness = cc.closeness
+        order by score desc
+        limit match_count;
+
+        create temp table measurements on commit drop as
+        select s.crawl_id as id, s.chunk_id, 'similarity'::score_type as score_type, s.score as score
+        from similarity s;
+
+        insert into measurements
+        select entity_id as id, chunk_id, s.score_type, s.score
+        from score s
+        inner join similarity sim on s.entity_id = sim.crawl_id
+        where s.entity_id in (select m.id from measurements m);
+    else
+        -- take the longest chunk per document based on tokens_count,
+        -- grouped by url to avoid duplicates
+
+        create or replace temp view measurements as
+        select entity_id as id, chunk_id, s.score_type, s.score
+        from default_chunk d
+        inner join score s on s.entity_id = d.id
+        where s.score_type in (select w.score_type from weights w);
+    end if;
+
+    with matches as (
+        select m.id, m.chunk_id,
+            avg(m.score * w.weight) as score,
+            jsonb_object_agg(m.score_type, m.score) as scores
+        from measurements m
+        inner join weights w on m.score_type = w.score_type
+        group by m.id, m.chunk_id
+        order by score desc
+        limit match_count
+    )
+    select json_agg(r) as search from (
+        select
+            query_id,
+            c.id,
+            c.url,
+            c.title,
+            ck.title as subtitle,
+            ck.text_content as content,
+            c.last_updated,
+            m.score,
+            m.scores
+        from matches m
+        inner join crawl c on m.id = c.id
+        inner join chunk ck on m.chunk_id = ck.id
+        order by m.score desc
+        limit match_count
+    ) r into query_result;
+
+    if query_id is not null then
+        update query set result = query_result where id = query_id;
+    end if;
+
+    return query_result;
+
+END;
+$BODY$
+LANGUAGE plpgsql;
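
A note on the md5hash::uuid cast in the second migration: an MD5 digest is 128 bits, the same width as a UUID, and Postgres's uuid type accepts 32 hex digits with or without hyphens as input, so the stored hashes convert directly. A minimal check, runnable on any Postgres:

-- md5() returns 32 hex characters, which the uuid type parses as-is
select md5('any text')::uuid;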
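
The weights_parameter handling relies on jsonb_each_text to turn the JSONB object into (key, value) rows before casting and filtering. It can be exercised standalone; the weight keys below are hypothetical, and key is left as text rather than cast to the score_type enum so the snippet runs without the schema:

select key, value::float as weight
from jsonb_each_text('{"similarity": 0.7, "freshness": 0.3}'::jsonb)
where value::float > 0.0;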
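
The <=> operator used for closeness is pgvector's cosine distance: 0 for identical directions, 1 for orthogonal vectors, 2 for opposite ones. That is why the similarity temp table converts the smallest closeness per crawl into a score with 1 - closeness. With the pgvector extension installed:

select '[1,0]'::vector <=> '[1,0]'::vector as identical,   -- 0
       '[1,0]'::vector <=> '[0,1]'::vector as orthogonal;  -- 1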
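
Finally, a sketch of calling the function down the keyword-free path (null embedding), which skips the similarity search and scores documents from default_chunk alone. The 'freshness' weight key is hypothetical; real keys must be labels of the score_type enum. Note that match_threshold is accepted but not referenced anywhere in this version of the function:

select search(
    'reset password',             -- query text
    null,                         -- no embedding: similarity search is skipped
    0.0,                          -- match_threshold (currently unused)
    10,                           -- match_count
    '{"freshness": 1.0}'::jsonb   -- weights keyed by score_type labels
);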