From 2e446d0f2fc67703077519d446f341a219b8c528 Mon Sep 17 00:00:00 2001 From: Matthew Peveler Date: Tue, 7 Oct 2025 16:48:24 -0600 Subject: [PATCH 1/4] update query for keyword search Signed-off-by: Matthew Peveler --- src/apis/kewordSearchTigerDocs.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apis/kewordSearchTigerDocs.ts b/src/apis/kewordSearchTigerDocs.ts index e3acdb3..26578b9 100644 --- a/src/apis/kewordSearchTigerDocs.ts +++ b/src/apis/kewordSearchTigerDocs.ts @@ -64,9 +64,9 @@ SELECT id::int, content, metadata::text, - -(content <@> to_tpquery($1, 'docs.timescale_chunks_content_idx')) as score + -(content <@> to_bm25query($1, 'docs.timescale_chunks_content_idx')) as score FROM ${schema}.timescale_chunks - ORDER BY content <@> to_tpquery($1, 'docs.timescale_chunks_content_idx') + ORDER BY content <@> to_bm25query($1, 'docs.timescale_chunks_content_idx') LIMIT $2 `, [keywords, limit || 10], From 5193d301b1bec20889e18adb84dc93e6ff766573 Mon Sep 17 00:00:00 2001 From: Matthew Peveler Date: Tue, 7 Oct 2025 17:45:00 -0600 Subject: [PATCH 2/4] fix handling insert when index exists Signed-off-by: Matthew Peveler --- ingest/tiger_docs.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/ingest/tiger_docs.py b/ingest/tiger_docs.py index 351d306..b2d5a49 100644 --- a/ingest/tiger_docs.py +++ b/ingest/tiger_docs.py @@ -63,21 +63,38 @@ class DatabaseManager: def __init__(self, database_uri, embedding_model=None): self.database_uri = database_uri self.embedding_model = embedding_model + self.finalize_queries: list[SQL] = [] + try: self.connection = psycopg.connect(self.database_uri) except Exception as e: raise RuntimeError(f"Database connection failed: {e}") - def create_tmp_tables(self): + def initialize(self): with self.connection.cursor() as cursor: cursor.execute(SQL("DROP TABLE IF EXISTS {schema}.timescale_chunks_tmp").format(schema=Identifier(schema))) cursor.execute(SQL("DROP TABLE IF EXISTS {schema}.timescale_pages_tmp").format(schema=Identifier(schema))) cursor.execute(SQL("CREATE TABLE {schema}.timescale_pages_tmp (LIKE {schema}.timescale_pages INCLUDING ALL EXCLUDING CONSTRAINTS)").format(schema=Identifier(schema))) cursor.execute(SQL("CREATE TABLE {schema}.timescale_chunks_tmp (LIKE {schema}.timescale_chunks INCLUDING ALL EXCLUDING CONSTRAINTS)").format(schema=Identifier(schema))) cursor.execute(SQL("ALTER TABLE {schema}.timescale_chunks_tmp ADD FOREIGN KEY (page_id) REFERENCES {schema}.timescale_pages_tmp(id) ON DELETE CASCADE").format(schema=Identifier(schema))) + + # The bm25 indexes have a bug that prevent inserting data into a table + # underneath non-public schemas that has them, so we need to make remove + # them from the tmp tables and recreate them after renaming. + cursor.execute(""" + SELECT indexdef + FROM pg_indexes + WHERE schemaname = %s + AND tablename LIKE %s + AND indexdef LIKE %s + """, ['docs', 'timescale%_tmp_%', '%bm25%']) + for row in cursor.fetchall(): + index_def = row[0] + tmp_index_def = index_def.replace('_tmp', '') + self.finalize_queries.append(SQL(tmp_index_def)) self.connection.commit() - def rename_objects(self): + def finalize(self): """Rename the temporary tables and their indexes to the final names, dropping the old tables if they exist""" with self.connection.cursor() as cursor: cursor.execute(SQL("DROP TABLE IF EXISTS {schema}.timescale_chunks").format(schema=Identifier(schema))) @@ -135,6 +152,9 @@ def rename_objects(self): ) ) + for query in self.finalize_queries: + cursor.execute(query) + self.connection.commit() def save_page(self, url, domain, filename, content_length, chunking_method='header'): @@ -1035,7 +1055,7 @@ def get_text_embeddings(self, texts): embedding_model = OpenAIEmbeddingWrapper(client) db_manager = DatabaseManager(database_uri=args.database_uri, embedding_model=embedding_model) - db_manager.create_tmp_tables() + db_manager.initialize() else: file_manager = FileManager(args.output_dir) @@ -1056,9 +1076,9 @@ def get_text_embeddings(self, texts): # Create database indexes after scraping completes if args.storage_type == 'database' and not args.skip_indexes and db_manager: try: - print("Renaming temporary tables to final names...") - db_manager.rename_objects() - print("Database indexes created successfully!") + print("Finalizing database...") + db_manager.finalize() + print("Database finalized successfully.") except Exception as e: print(f"Failed to finish database: {e}") raise SystemExit(1) From 183fc40d8da51030097680e3cf32b47eb572da89 Mon Sep 17 00:00:00 2001 From: Matthew Peveler Date: Tue, 7 Oct 2025 20:06:32 -0600 Subject: [PATCH 3/4] actually drop index after adding to finalize_queries Signed-off-by: Matthew Peveler --- ingest/tiger_docs.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/ingest/tiger_docs.py b/ingest/tiger_docs.py index b2d5a49..36baa7b 100644 --- a/ingest/tiger_docs.py +++ b/ingest/tiger_docs.py @@ -81,16 +81,27 @@ def initialize(self): # The bm25 indexes have a bug that prevent inserting data into a table # underneath non-public schemas that has them, so we need to make remove # them from the tmp tables and recreate them after renaming. - cursor.execute(""" - SELECT indexdef + cursor.execute( + """ + SELECT indexname, indexdef FROM pg_indexes WHERE schemaname = %s AND tablename LIKE %s AND indexdef LIKE %s - """, ['docs', 'timescale%_tmp_%', '%bm25%']) - for row in cursor.fetchall(): - index_def = row[0] - tmp_index_def = index_def.replace('_tmp', '') + """, + ["docs", "timescale%_tmp_%", "%bm25%"], + ) + rows = cursor.fetchall() + for row in rows: + index_name = row[0] + index_def = row[1] + tmp_index_def = index_def.replace("_tmp", "") + cursor.execute( + SQL("DROP INDEX IF EXISTS {schema}.{index_name}").format( + schema=Identifier(schema), + index_name=Identifier(index_name), + ) + ) self.finalize_queries.append(SQL(tmp_index_def)) self.connection.commit() From db0ef4ca318eaff6e9975ddc2200d448e7b0ae48 Mon Sep 17 00:00:00 2001 From: Matthew Peveler Date: Tue, 7 Oct 2025 20:17:57 -0600 Subject: [PATCH 4/4] fix tablename where for tmp table Signed-off-by: Matthew Peveler --- ingest/tiger_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/tiger_docs.py b/ingest/tiger_docs.py index 36baa7b..314fe97 100644 --- a/ingest/tiger_docs.py +++ b/ingest/tiger_docs.py @@ -89,7 +89,7 @@ def initialize(self): AND tablename LIKE %s AND indexdef LIKE %s """, - ["docs", "timescale%_tmp_%", "%bm25%"], + ["docs", "timescale%_tmp%", "%bm25%"], ) rows = cursor.fetchall() for row in rows: