Skip to content

Commit

Permalink
issue #38: fix search implementation to drop use of documents view
Browse files Browse the repository at this point in the history
  • Loading branch information
rngadam committed Mar 20, 2024
1 parent fd08308 commit ce18b09
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 0 deletions.
1 change: 1 addition & 0 deletions sql/2024-03-20-html_content_to_chunk_chunk_id_idx.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
create index html_content_to_chunk_chunk_id_idx on html_content_to_chunk(chunk_id);
11 changes: 11 additions & 0 deletions sql/2024-03-20-md5hash-to-uuid-with-indexes.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
ALTER TABLE html_content ADD md5hash_uuid uuid;
ALTER TABLE html_content_to_chunk ADD md5hash_uuid uuid;
ALTER TABLE crawl ADD md5hash_uuid uuid;

update html_content set md5hash_uuid = md5hash::uuid;
update html_content_to_chunk set md5hash_uuid = md5hash::uuid;
update crawl set md5hash_uuid = md5hash::uuid;

create index html_content_to_chunk_md5hash_uuid_idx on html_content_to_chunk(md5hash_uuid);
create index html_content_to_chunk_md5hash_uuid_idx on html_content(md5hash_uuid);
create index crawl_md5hash_uuid_idx on crawl(md5hash_uuid);
130 changes: 130 additions & 0 deletions sql/2024-03-20-weighted-search-updated.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
DROP FUNCTION if exists search(text,vector,double precision,integer,jsonb);
CREATE OR REPLACE FUNCTION search(
query text,
query_embedding vector,
match_threshold double precision,
match_count integer,
weights_parameter JSONB
)
RETURNS JSONB
AS
$BODY$
DECLARE
similarity_search boolean;
query_id uuid;
query_result JSONB;
BEGIN

create temp table weights (
score_type score_type,
weight double precision
) on commit drop;

INSERT INTO weights
SELECT key::score_type, value::float
FROM jsonb_each_text(weights_parameter)
WHERE value::float > 0.0;

-- check if there is a query embedding, if not skip similarity search
if query_embedding is null then
similarity_search := false;
else
similarity_search := true;
insert into query(query, embedding) values (query, query_embedding)
on conflict do nothing
returning id, result into query_id, query_result;

if query_result is not null then
return query_result;
end if;
end if;

if similarity_search then
create temp table selected_chunks on commit drop as
select
ada_002.id as embedding_id,
chunk.id as chunk_id,
crawl.id as crawl_id,
score,
score_type,
ada_002.embedding <=> query_embedding as closeness
from ada_002
inner join "token" on ada_002.token_id = "token".id
inner join "chunk" on "token".chunk_id = chunk.id
inner join html_content_to_chunk hctc on hctc.chunk_id = chunk.id
inner join html_content hc on hctc.md5hash_uuid = hc.md5hash_uuid
inner join crawl on hc.md5hash_uuid = crawl.md5hash_uuid
inner join "score" on crawl.id = score.entity_id
order by ada_002.embedding <=> query_embedding
limit match_count*10;


create temp table similarity on commit drop as
with chunk_closest_by_id as (
select c.crawl_id, min(c.closeness) as closeness
from selected_chunks c
group by c.crawl_id
)
select c.crawl_id as crawl_id, c.chunk_id as chunk_id, (1-c.closeness) as score
from selected_chunks c
inner join chunk_closest_by_id cc on c.crawl_id = cc.crawl_id and c.closeness = cc.closeness
order by score desc
limit match_count;

create temp table measurements on commit drop as
select s.crawl_id as id, s.chunk_id, 'similarity'::score_type as score_type, s.score as score
from similarity s;

insert into measurements
select entity_id as id, chunk_id, s.score_type, s.score
from score s
inner join similarity sim on s.entity_id = sim.crawl_id
where s.entity_id in (select m.id from measurements m);
else
-- take the longest chunk per document based on the number of tokens_count
-- grouped by url to avoid duplicates

create or replace temp view measurements as
select entity_id as id, chunk_id, s.score_type, s.score
from default_chunk d
inner join score s on s.entity_id = d.id
where s.score_type in (select w.score_type from weights w);
end if;

with matches as (
select m.id, m.chunk_id,
avg(m.score * w.weight) as score,
jsonb_object_agg(m.score_type, m.score) as scores
from measurements m
inner join weights w on m.score_type = w.score_type
group by m.id, m.chunk_id
order by score desc
limit match_count
)
select json_agg(r) as search from (
select
query_id,
c.id,
c.url,
c.title,
ck.title as subtitle,
ck.text_content as content,
c.last_updated,
m.score,
m.scores
from matches m
inner join crawl c on m.id = c.id
inner join chunk ck on m.chunk_id = ck.id
order by m.score desc
limit match_count
) r into query_result;

if query_id is not null then
update query set result = query_result where id = query_id;
end if;

return query_result;

END;
$BODY$
LANGUAGE plpgsql;

0 comments on commit ce18b09

Please sign in to comment.