From ac8f340f64ab695704ba48f4f158ebe482ec3caf Mon Sep 17 00:00:00 2001
From: James Guthrie
Date: Mon, 6 Jan 2025 16:20:46 +0100
Subject: [PATCH] fix: handle 'null' value in chunking 'chunk_column'

The chunking configuration takes a 'chunk_column' parameter, which
determines which column of the source row is chunked. This commit treats
a null entry as being equivalent to the empty string, and no embeddings
are generated.
---
 projects/pgai/pgai/vectorizer/chunking.py     |  6 ++-
 .../tests/vectorizer/test_vectorizer_cli.py   | 53 +++++++++++++++++++
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/projects/pgai/pgai/vectorizer/chunking.py b/projects/pgai/pgai/vectorizer/chunking.py
index 549fc80fd..7cf6ccb53 100644
--- a/projects/pgai/pgai/vectorizer/chunking.py
+++ b/projects/pgai/pgai/vectorizer/chunking.py
@@ -80,7 +80,8 @@ def into_chunks(self, item: dict[str, Any]) -> list[str]:
         Returns:
             list[str]: A list of chunked strings.
         """
-        return self._chunker.split_text(item[self.chunk_column])
+        text = item[self.chunk_column] or ""
+        return self._chunker.split_text(text)
 
 
 class LangChainRecursiveCharacterTextSplitter(BaseModel, Chunker):
@@ -126,4 +127,5 @@ def into_chunks(self, item: dict[str, Any]) -> list[str]:
         Returns:
             list[str]: A list of chunked strings.
         """
-        return self._chunker.split_text(item[self.chunk_column])
+        text = item[self.chunk_column] or ""
+        return self._chunker.split_text(text)
diff --git a/projects/pgai/tests/vectorizer/test_vectorizer_cli.py b/projects/pgai/tests/vectorizer/test_vectorizer_cli.py
index 9ca7e2ad5..04f97154c 100644
--- a/projects/pgai/tests/vectorizer/test_vectorizer_cli.py
+++ b/projects/pgai/tests/vectorizer/test_vectorizer_cli.py
@@ -956,3 +956,56 @@ def test_recursive_character_splitting(
     assert sequences == list(
         range(len(sequences))
     ), "Chunk sequences should be sequential starting from 0"
+
+
+@pytest.mark.parametrize(
+    "test_params",
+    [
+        (
+            1,
+            1,
+            1,
+            "chunking_character_text_splitter('content')",
+            "formatting_python_template('$chunk')",
+        ),
+        (
+            1,
+            1,
+            1,
+            "chunking_recursive_character_text_splitter('content')",
+            "formatting_python_template('$chunk')",
+        ),
+    ],
+)
+def test_vectorization_successful_with_null_contents(
+    cli_db: tuple[PostgresContainer, Connection],
+    cli_db_url: str,
+    configured_ollama_vectorizer_id: int,
+    test_params: tuple[int, int, int, str, str],  # noqa: ARG001
+):
+    _, conn = cli_db
+
+    with conn.cursor(row_factory=dict_row) as cur:
+        cur.execute("ALTER TABLE blog ALTER COLUMN content DROP NOT NULL;")
+        cur.execute("UPDATE blog SET content = null;")
+
+    result = CliRunner().invoke(
+        vectorizer_worker,
+        [
+            "--db-url",
+            cli_db_url,
+            "--once",
+            "--vectorizer-id",
+            str(configured_ollama_vectorizer_id),
+        ],
+        catch_exceptions=False,
+    )
+
+    assert not result.exception
+    assert result.exit_code == 0
+
+    _, conn = cli_db
+
+    with conn.cursor(row_factory=dict_row) as cur:
+        cur.execute("SELECT count(*) as count FROM blog_embedding_store;")
+        assert cur.fetchone()["count"] == 0  # type: ignore