diff --git a/projects/pgai/pgai/alembic/autogenerate.py b/projects/pgai/pgai/alembic/autogenerate.py index 63a06bba8..ea54a69ba 100644 --- a/projects/pgai/pgai/alembic/autogenerate.py +++ b/projects/pgai/pgai/alembic/autogenerate.py @@ -27,7 +27,9 @@ class ExistingVectorizer: @comparators.dispatch_for("schema") def compare_vectorizers( - autogen_context: AutogenContext, upgrade_ops: UpgradeOps, schemas: list[str] + autogen_context: AutogenContext, + upgrade_ops: UpgradeOps, + schemas: list[str], # noqa: ARG001 ): """Compare vectorizers between model and database, generating appropriate migration operations.""" @@ -46,11 +48,14 @@ def compare_vectorizers( ).fetchall() for row in result: - parsed_vectorizer = Vectorizer.model_validate(row.vectorizer) # type: ignore + parsed_vectorizer = Vectorizer.model_validate(row.vectorizer) # type: ignore existing_vectorizer = ExistingVectorizer( - parsed_vectorizer.id, CreateVectorizerParams.from_db_config(parsed_vectorizer) + parsed_vectorizer.id, + CreateVectorizerParams.from_db_config(parsed_vectorizer), + ) + target_table = ( + f"{parsed_vectorizer.target_schema}.{parsed_vectorizer.target_table}" ) - target_table = f"{parsed_vectorizer.target_schema}.{parsed_vectorizer.target_table}" existing_vectorizers[target_table] = existing_vectorizer # Get vectorizers from models model_vectorizers: dict[str, CreateVectorizerParams] = {} diff --git a/projects/pgai/pgai/alembic/operations.py b/projects/pgai/pgai/alembic/operations.py index 359b4e89b..4604e562a 100644 --- a/projects/pgai/pgai/alembic/operations.py +++ b/projects/pgai/pgai/alembic/operations.py @@ -8,9 +8,9 @@ ChunkingConfig, CreateVectorizerParams, DiskANNIndexingConfig, - OpenAIEmbeddingConfig, HNSWIndexingConfig, NoSchedulingConfig, + OpenAIEmbeddingConfig, ProcessingConfig, SchedulingConfig, ) diff --git a/projects/pgai/pgai/configuration.py b/projects/pgai/pgai/configuration.py index b189e88da..ef76c1517 100644 --- a/projects/pgai/pgai/configuration.py +++ b/projects/pgai/pgai/configuration.py @@ -1,5 +1,5 @@ import textwrap -from dataclasses import dataclass, fields +from dataclasses import dataclass, fields, replace from typing import Any, Literal, Protocol, runtime_checkable from alembic.autogenerate.api import AutogenContext @@ -10,11 +10,11 @@ LangChainCharacterTextSplitter, LangChainRecursiveCharacterTextSplitter, ) -from pgai.vectorizer.embeddings import OpenAI, Ollama +from pgai.vectorizer.embeddings import Ollama, OpenAI from pgai.vectorizer.formatting import ChunkValue, PythonTemplate -from pgai.vectorizer.indexing import DiskANNIndexing, HNSWIndexing +from pgai.vectorizer.indexing import DiskANNIndexing, HNSWIndexing, NoIndexing from pgai.vectorizer.processing import ProcessingDefault -from pgai.vectorizer.scheduling import TimescaleScheduling +from pgai.vectorizer.scheduling import NoScheduling, TimescaleScheduling def equivalent_value(a: Any, b: Any, default: Any) -> bool: @@ -153,9 +153,9 @@ def from_db_config(cls, config: Ollama) -> "OllamaEmbeddingConfig": return cls( model=config.model, dimensions=config.dimensions, - base_url = config.base_url, - truncate = config.truncate, - keep_alive = config.keep_alive + base_url=config.base_url, + truncate=config.truncate, + keep_alive=config.keep_alive, ) @@ -176,6 +176,19 @@ class ChunkingConfig: @override def __eq__(self, other: object) -> bool: + if not isinstance(other, ChunkingConfig): + return False + # Handle the separator special case + if self.separator is not None and other.separator is not None: + if isinstance(self.separator, str) and isinstance(other.separator, list): + other = replace( + other, + separator=[other.separator[0]] + if len(other.separator) == 1 + else other.separator, + ) + elif isinstance(self.separator, list) and isinstance(other.separator, str): + other = replace(other, separator=[other.separator]) return equivalent_dataclass_with_defaults(self, other, self._defaults) def to_sql_argument(self) -> str: @@ -230,6 +243,23 @@ def from_db_config( ) +@dataclass +class NoIndexingConfig: + @override + def __eq__(self, other: object) -> bool: + return isinstance(other, NoIndexingConfig) + + def to_sql_argument(self) -> str: + return ", indexing => ai.indexing_none()" + + def to_python_arg(self) -> str: + return format_python_arg("indexing", self) + + @classmethod + def from_db_config(cls, config: NoIndexing) -> "NoIndexingConfig": # noqa: ARG003 + return cls() + + @dataclass class DiskANNIndexingConfig: min_rows: int | None = None @@ -271,7 +301,7 @@ def to_sql_argument(self) -> str: def to_python_arg(self) -> str: return format_python_arg("indexing", self) - + @classmethod def from_db_config(cls, config: DiskANNIndexing) -> "DiskANNIndexingConfig": return cls( @@ -289,7 +319,7 @@ def from_db_config(cls, config: DiskANNIndexing) -> "DiskANNIndexingConfig": @dataclass class HNSWIndexingConfig: min_rows: int | None = None - opclass: Literal["vector_cosine_ops", "vector_l2_ops", "vector_ip_ops"] | None = ( + opclass: Literal["vector_cosine_ops", "vector_l1_ops", "vector_ip_ops"] | None = ( None ) m: int | None = None @@ -324,7 +354,7 @@ def to_sql_argument(self) -> str: def to_python_arg(self) -> str: return format_python_arg("indexing", self) - + @classmethod def from_db_config(cls, config: HNSWIndexing) -> "HNSWIndexingConfig": return cls( @@ -348,6 +378,10 @@ def to_sql_argument(self) -> str: def to_python_arg(self) -> str: return format_python_arg("scheduling", self) + @classmethod + def from_db_config(cls, config: NoScheduling) -> "NoSchedulingConfig": # noqa: ARG003 + return cls() + @dataclass class SchedulingConfig: @@ -376,7 +410,7 @@ def to_sql_argument(self) -> str: def to_python_arg(self) -> str: return format_python_arg("scheduling", self) - + @classmethod def from_db_config(cls, config: TimescaleScheduling) -> "SchedulingConfig": return cls( @@ -430,7 +464,9 @@ class CreateVectorizerParams: source_table: str | None embedding: OpenAIEmbeddingConfig | OllamaEmbeddingConfig | None = None chunking: ChunkingConfig | None = None - indexing: DiskANNIndexingConfig | HNSWIndexingConfig | None = None + indexing: DiskANNIndexingConfig | HNSWIndexingConfig | NoIndexingConfig | None = ( + None + ) formatting_template: str | None = None scheduling: SchedulingConfig | NoSchedulingConfig | None = None processing: ProcessingConfig | None = None @@ -448,6 +484,7 @@ class CreateVectorizerParams: "enqueue_existing": True, "processing": ProcessingConfig(), "scheduling": NoSchedulingConfig(), + "indexing": NoIndexingConfig(), "queue_schema": "ai", } @@ -555,20 +592,42 @@ def from_db_config(cls, vectorizer: Vectorizer) -> "CreateVectorizerParams": """ embedding_config: None | OpenAIEmbeddingConfig | OllamaEmbeddingConfig = None if isinstance(vectorizer.config.embedding, OpenAI): - embedding_config = OpenAIEmbeddingConfig.from_db_config(vectorizer.config.embedding) + embedding_config = OpenAIEmbeddingConfig.from_db_config( + vectorizer.config.embedding + ) if isinstance(vectorizer.config.embedding, Ollama): - embedding_config = OllamaEmbeddingConfig.from_db_config(vectorizer.config.embedding) + embedding_config = OllamaEmbeddingConfig.from_db_config( + vectorizer.config.embedding + ) chunking_config = ChunkingConfig.from_db_config(vectorizer.config.chunking) - processing_config = ProcessingConfig.from_db_config(vectorizer.config.processing) - indexing_config: None | DiskANNIndexingConfig | HNSWIndexingConfig = None + processing_config = ProcessingConfig.from_db_config( + vectorizer.config.processing + ) + indexing_config: ( + None | DiskANNIndexingConfig | HNSWIndexingConfig | NoIndexingConfig + ) = None if isinstance(vectorizer.config.indexing, DiskANNIndexing): - indexing_config = DiskANNIndexingConfig.from_db_config(vectorizer.config.indexing) + indexing_config = DiskANNIndexingConfig.from_db_config( + vectorizer.config.indexing + ) if isinstance(vectorizer.config.indexing, HNSWIndexing): - indexing_config = HNSWIndexingConfig.from_db_config(vectorizer.config.indexing) - + indexing_config = HNSWIndexingConfig.from_db_config( + vectorizer.config.indexing + ) + if isinstance(vectorizer.config.indexing, NoIndexing): + indexing_config = NoIndexingConfig.from_db_config( + vectorizer.config.indexing + ) + scheduling_config: None | NoSchedulingConfig | SchedulingConfig = None if isinstance(vectorizer.config.scheduling, TimescaleScheduling): - scheduling_config = SchedulingConfig.from_db_config(vectorizer.config.scheduling) + scheduling_config = SchedulingConfig.from_db_config( + vectorizer.config.scheduling + ) + if isinstance(vectorizer.config.scheduling, NoScheduling): + scheduling_config = NoSchedulingConfig.from_db_config( + vectorizer.config.scheduling + ) # Get formatting template formatting_template = None diff --git a/projects/pgai/pgai/sqlalchemy/__init__.py b/projects/pgai/pgai/sqlalchemy/__init__.py index 8245e6922..2bcf6af77 100644 --- a/projects/pgai/pgai/sqlalchemy/__init__.py +++ b/projects/pgai/pgai/sqlalchemy/__init__.py @@ -8,9 +8,9 @@ ChunkingConfig, CreateVectorizerParams, DiskANNIndexingConfig, - OpenAIEmbeddingConfig, HNSWIndexingConfig, NoSchedulingConfig, + OpenAIEmbeddingConfig, ProcessingConfig, SchedulingConfig, ) diff --git a/projects/pgai/pgai/vectorizer/indexing.py b/projects/pgai/pgai/vectorizer/indexing.py index b7e91a794..a9580bf1a 100644 --- a/projects/pgai/pgai/vectorizer/indexing.py +++ b/projects/pgai/pgai/vectorizer/indexing.py @@ -20,17 +20,17 @@ class DiskANNIndexing(BaseModel): num_bytes_per_index: The number of bytes to use per index. num_bytes_per_cluster: The number of bytes to use per cluster. """ - + implementation: Literal["diskann"] min_rows: int - storage_layout: Literal["memory_optimized", "plain"] - num_neighbors: int - search_list_size: int - max_alpha: float - num_dimensions: int - num_bits_per_dimension: int + storage_layout: Literal["memory_optimized", "plain"] | None = None + num_neighbors: int | None = None + search_list_size: int | None = None + max_alpha: float | None = None + num_dimensions: int | None = None + num_bits_per_dimension: int | None = None create_when_queue_empty: bool - + class HNSWIndexing(BaseModel): """ @@ -44,19 +44,18 @@ class HNSWIndexing(BaseModel): num_bytes_per_vector: The number of bytes to use per vector. num_bytes_per_index: The number of bytes to use per index. """ - + implementation: Literal["hnsw"] min_rows: int - opclass: Literal["vector_cosine_ops", "vector_l2_ops", "vector_ip_ops"] + opclass: Literal["vector_cosine_ops", "vector_l1_ops", "vector_ip_ops"] m: int ef_construction: int create_when_queue_empty: bool - + class NoIndexing(BaseModel): """ No indexing configuration. """ - + implementation: Literal["none"] - \ No newline at end of file diff --git a/projects/pgai/pgai/vectorizer/scheduling.py b/projects/pgai/pgai/vectorizer/scheduling.py index c8d96cef9..3d379e332 100644 --- a/projects/pgai/pgai/vectorizer/scheduling.py +++ b/projects/pgai/pgai/vectorizer/scheduling.py @@ -12,17 +12,18 @@ class TimescaleScheduling(BaseModel): interval: The interval at which to run the scheduling. retention_policy: The retention policy to use. """ - + implementation: Literal["timescaledb"] - schedule_interval: datetime.timedelta - initial_start: str - job_id: int + schedule_interval: datetime.timedelta | None = None + initial_start: str | None = None + job_id: int | None = None fixed_schedule: bool - timezone: str + timezone: str | None = None + class NoScheduling(BaseModel): """ No scheduling configuration. """ - - implementation: Literal["none"] \ No newline at end of file + + implementation: Literal["none"] diff --git a/projects/pgai/pgai/vectorizer/vectorizer.py b/projects/pgai/pgai/vectorizer/vectorizer.py index 47a2725c9..84a795a13 100644 --- a/projects/pgai/pgai/vectorizer/vectorizer.py +++ b/projects/pgai/pgai/vectorizer/vectorizer.py @@ -26,7 +26,7 @@ from .formatting import ChunkValue, PythonTemplate from .indexing import DiskANNIndexing, HNSWIndexing, NoIndexing from .processing import ProcessingDefault -from .scheduling import TimescaleScheduling, NoScheduling +from .scheduling import NoScheduling, TimescaleScheduling logger = structlog.get_logger() diff --git a/projects/pgai/tests/vectorizer/conftest.py b/projects/pgai/tests/vectorizer/conftest.py index a447f24fb..f598c148a 100644 --- a/projects/pgai/tests/vectorizer/conftest.py +++ b/projects/pgai/tests/vectorizer/conftest.py @@ -1,6 +1,7 @@ import os from pathlib import Path from typing import Any + import pytest import tiktoken import vcr # type:ignore @@ -78,11 +79,11 @@ def postgres_container(): def timescale_ha_container(): load_dotenv() with PostgresContainer( - image="timescale/timescaledb-ha:pg16", - username="tsdbquerier", - password="my-password", - dbname="tsdb", - driver=None, - command="postgres -c shared_preload_libraries=timescaledb" + image="timescale/timescaledb-ha:pg16", + username="tsdbquerier", + password="my-password", + dbname="tsdb", + driver=None, + command="postgres -c shared_preload_libraries=timescaledb", ).with_env("OPENAI_API_KEY", os.environ["OPENAI_API_KEY"]) as postgres: - yield postgres \ No newline at end of file + yield postgres diff --git a/projects/pgai/tests/vectorizer/extensions/conftest.py b/projects/pgai/tests/vectorizer/extensions/conftest.py index a4b4f8cdd..cb5c86cea 100644 --- a/projects/pgai/tests/vectorizer/extensions/conftest.py +++ b/projects/pgai/tests/vectorizer/extensions/conftest.py @@ -78,7 +78,9 @@ def alembic_config(alembic_dir: Path, postgres_container: PostgresContainer) -> @pytest.fixture -def timescale_alembic_config(alembic_dir: Path, timescale_ha_container: PostgresContainer) -> Config: +def timescale_alembic_config( + alembic_dir: Path, timescale_ha_container: PostgresContainer +) -> Config: """Create a configured Alembic environment.""" # Create alembic.ini from template ini_path = alembic_dir / "alembic.ini" @@ -140,7 +142,7 @@ def initialized_engine( with engine.connect() as conn: conn.execute(text("DROP SCHEMA public CASCADE; CREATE SCHEMA public;")) conn.commit() - + @pytest.fixture def initialized_engine_with_timescale( diff --git a/projects/pgai/tests/vectorizer/extensions/fixtures/migrations/002_create_vectorizer_all_fields.py.template b/projects/pgai/tests/vectorizer/extensions/fixtures/migrations/002_create_vectorizer_all_fields.py.template index c01fc9dc9..b83563198 100644 --- a/projects/pgai/tests/vectorizer/extensions/fixtures/migrations/002_create_vectorizer_all_fields.py.template +++ b/projects/pgai/tests/vectorizer/extensions/fixtures/migrations/002_create_vectorizer_all_fields.py.template @@ -7,7 +7,7 @@ Create Date: {create_date} from alembic import op from pgai.alembic import CreateVectorizerOp, DropVectorizerOp import sqlalchemy as sa -from pgai.configuration import ChunkingConfig, DiskANNIndexingConfig, OpenAIEmbeddingConfig, ProcessingConfig, SchedulingConfig +from pgai.configuration import * # revision identifiers revision = '{revision_id}' @@ -25,30 +25,16 @@ def upgrade(): ) op.create_vectorizer( 'blog_posts', - embedding=OpenAIEmbeddingConfig( - model='text-embedding-3-small', - dimensions=768, - chat_user='test_user', - api_key_name='test_key' - ), - chunking=ChunkingConfig( - chunk_column='content', - chunk_size=500, - chunk_overlap=10, - separator=' ', - is_separator_regex=True - ), - scheduling=SchedulingConfig( - schedule_interval= "1h", - initial_start= "2022-01-01T00:00:00Z", - fixed_schedule= True, - timezone= "UTC" - ), - {indexing}, - processing=ProcessingConfig( - batch_size=10, - concurrency=5 - ), + {embedding} + , + {chunking} + , + {scheduling} + , + {indexing} + , + {processing} + , target_schema='timescale', target_table='blog_posts_embedding', view_schema='timescale', diff --git a/projects/pgai/tests/vectorizer/extensions/test_alembic.py b/projects/pgai/tests/vectorizer/extensions/test_alembic.py index b790e98fd..6b3097773 100644 --- a/projects/pgai/tests/vectorizer/extensions/test_alembic.py +++ b/projects/pgai/tests/vectorizer/extensions/test_alembic.py @@ -1,14 +1,9 @@ -from datetime import timedelta from pathlib import Path from alembic.command import downgrade, upgrade from alembic.config import Config from sqlalchemy import Engine, inspect, text -from pgai.configuration import DiskANNIndexingConfig -from pgai.vectorizer import Vectorizer -from pgai.vectorizer.indexing import DiskANNIndexing -from pgai.vectorizer.scheduling import TimescaleScheduling from tests.vectorizer.extensions.conftest import load_template @@ -130,79 +125,3 @@ def test_vectorizer_migration( inspector = inspect(initialized_engine) tables = inspector.get_table_names() assert "blog" not in tables - - - -def test_vectorizer_migration_all_fields( - alembic_config: Config, - initialized_engine: Engine, - cleanup_modules: None, # noqa: ARG001 -): - """Test vectorizer creation with a bunch of fields""" - migrations_dir = Path(alembic_config.get_main_option("script_location")) # type: ignore - versions_dir = migrations_dir / "versions" - - with initialized_engine.connect() as conn: - conn.execute( - text( - """ - CREATE schema timescale; - CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE; - """ - ) - ) - conn.commit() - - # First migration - create blog table - indexing_config = DiskANNIndexingConfig( - min_rows=10, - storage_layout='plain', - num_neighbors=5, - search_list_size=10, - max_alpha=0.5, - num_dimensions=10, - num_bits_per_dimension=10, - create_when_queue_empty=False - ) - blog_content = load_template( - "migrations/001_create_blog_table.py.template", - revision_id="001", - revises="", - create_date="2024-03-19 10:00:00.000000", - down_revision="None", - ) - with open(versions_dir / "001_create_blog_table.py", "w") as f: - f.write(blog_content) - - # Second migration - create vectorizer - vectorizer_content = load_template( - "migrations/002_create_vectorizer_all_fields.py.template", - revision_id="002", - revises="001", - create_date="2024-03-19 10:01:00.000000", - down_revision="001", - indexing=indexing_config.to_python_arg() - ) - with open(versions_dir / "002_create_vectorizer.py", "w") as f: - f.write(vectorizer_content) - - # Run upgrade - upgrade(alembic_config, "head") - - # Verify vectorizer exists - with initialized_engine.connect() as conn: - rows = conn.execute( - text(""" - select pg_catalog.to_jsonb(v) as vectorizer from ai.vectorizer v - """) - ).fetchall() - assert len(rows) == 1 - parsed_vectorizer = Vectorizer.model_validate(rows[0].vectorizer) # type: ignore - assert parsed_vectorizer.target_table == "blog_posts_embedding" - assert isinstance(parsed_vectorizer.config.scheduling, TimescaleScheduling) - assert parsed_vectorizer.config.scheduling.fixed_schedule == True - assert parsed_vectorizer.config.scheduling.schedule_interval == timedelta(hours=1) - - assert isinstance(parsed_vectorizer.config.indexing, DiskANNIndexing) - assert parsed_vectorizer.config.indexing.min_rows == 10 - assert parsed_vectorizer.config.indexing.storage_layout == 'plain' \ No newline at end of file diff --git a/projects/pgai/tests/vectorizer/extensions/test_alembic_fields.py b/projects/pgai/tests/vectorizer/extensions/test_alembic_fields.py new file mode 100644 index 000000000..3c172b2c3 --- /dev/null +++ b/projects/pgai/tests/vectorizer/extensions/test_alembic_fields.py @@ -0,0 +1,394 @@ +from pathlib import Path + +from alembic.command import upgrade +from alembic.config import Config +from sqlalchemy import Engine, text + +from pgai.configuration import ( + ChunkingConfig, + CreateVectorizerParams, + DiskANNIndexingConfig, + HNSWIndexingConfig, + NoIndexingConfig, + NoSchedulingConfig, + OllamaEmbeddingConfig, + OpenAIEmbeddingConfig, + ProcessingConfig, + SchedulingConfig, +) +from pgai.vectorizer import Vectorizer +from tests.vectorizer.extensions.conftest import load_template + +default_embedding_config = OpenAIEmbeddingConfig( + model="text-embedding-3-small", + dimensions=768, + chat_user="test_user", + api_key_name="test_key", +) + +default_chunking_config = ChunkingConfig( + chunk_column="content", + chunk_size=500, + chunk_overlap=10, + separator=" ", + is_separator_regex=True, +) + +default_scheduling_config = SchedulingConfig( + schedule_interval="1h", + initial_start="2022-01-01T00:00:00Z", + fixed_schedule=True, + timezone="UTC", +) + +default_processing_config = ProcessingConfig(batch_size=10, concurrency=5) + +default_indexing_config = DiskANNIndexingConfig( + min_rows=10, + storage_layout="plain", + num_neighbors=5, + search_list_size=10, + max_alpha=0.5, + num_dimensions=10, + num_bits_per_dimension=10, + create_when_queue_empty=False, +) + + +def setup_migrations( + alembic_config: Config, + engine: Engine, + embedding_config: OpenAIEmbeddingConfig | OllamaEmbeddingConfig | None = None, + chunking_config: ChunkingConfig | None = None, + scheduling_config: SchedulingConfig | NoSchedulingConfig | None = None, + processing_config: ProcessingConfig | None = None, + indexing_config: DiskANNIndexingConfig + | HNSWIndexingConfig + | NoIndexingConfig + | None = None, +) -> tuple[ + OpenAIEmbeddingConfig | OllamaEmbeddingConfig, + ChunkingConfig, + SchedulingConfig | NoSchedulingConfig, + ProcessingConfig, + DiskANNIndexingConfig | HNSWIndexingConfig | NoIndexingConfig, +]: + migrations_dir = Path(alembic_config.get_main_option("script_location")) # type: ignore + versions_dir = migrations_dir / "versions" + + with engine.connect() as conn: + conn.execute( + text( + """ + CREATE SCHEMA IF NOT EXISTS timescale; + CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE; + """ + ) + ) + conn.commit() + + if embedding_config is None: + embedding_config = default_embedding_config + + if chunking_config is None: + chunking_config = default_chunking_config + + if scheduling_config is None: + scheduling_config = default_scheduling_config + + if processing_config is None: + processing_config = default_processing_config + + if indexing_config is None: + indexing_config = default_indexing_config + + blog_content = load_template( + "migrations/001_create_blog_table.py.template", + revision_id="001", + revises="", + create_date="2024-03-19 10:00:00.000000", + down_revision="None", + ) + with open(versions_dir / "001_create_blog_table.py", "w") as f: + f.write(blog_content) + + # Second migration - create vectorizer + vectorizer_content = load_template( + "migrations/002_create_vectorizer_all_fields.py.template", + revision_id="002", + revises="001", + create_date="2024-03-19 10:01:00.000000", + down_revision="001", + embedding=embedding_config.to_python_arg(), + chunking=chunking_config.to_python_arg(), + scheduling=scheduling_config.to_python_arg(), + processing=processing_config.to_python_arg(), + indexing=indexing_config.to_python_arg(), + ) + with open(versions_dir / "002_create_vectorizer.py", "w") as f: + f.write(vectorizer_content) + + return ( + embedding_config, + chunking_config, + scheduling_config, + processing_config, + indexing_config, + ) + + +def compare_db_vectorizer_with_configs( + engine: Engine, + embedding_config: OpenAIEmbeddingConfig | OllamaEmbeddingConfig | None = None, + chunking_config: ChunkingConfig | None = None, + scheduling_config: SchedulingConfig | NoSchedulingConfig | None = None, + processing_config: ProcessingConfig | None = None, + indexing_config: DiskANNIndexingConfig + | HNSWIndexingConfig + | NoIndexingConfig + | None = None, +): + if embedding_config is None: + embedding_config = default_embedding_config + + if chunking_config is None: + chunking_config = default_chunking_config + + if scheduling_config is None: + scheduling_config = default_scheduling_config + + if processing_config is None: + processing_config = default_processing_config + + if indexing_config is None: + indexing_config = default_indexing_config + with engine.connect() as conn: + rows = conn.execute( + text(""" + select pg_catalog.to_jsonb(v) as vectorizer from ai.vectorizer v + """) + ).fetchall() + assert len(rows) == 1 + parsed_vectorizer = Vectorizer.model_validate(rows[0].vectorizer) # type: ignore + params = CreateVectorizerParams.from_db_config(parsed_vectorizer) + + assert params.embedding == embedding_config + assert params.chunking == chunking_config + assert params.processing == processing_config + assert params.indexing == indexing_config + if isinstance(scheduling_config, NoSchedulingConfig): + assert params.scheduling == scheduling_config + # Note that a scheduling config can currently not be compared because + # the representation of time in the db is different + # from the representation in the config object + + +def test_vectorizer_migration_default_fields( + alembic_config: Config, initialized_engine: Engine +): + """Test vectorizer creation with a bunch of fields""" + setup_migrations(alembic_config, initialized_engine) + upgrade(alembic_config, "head") + + compare_db_vectorizer_with_configs(initialized_engine) + + +def test_vectorizer_migration_ollama( + alembic_config: Config, initialized_engine: Engine +): + """Test vectorizer creation with a bunch of fields""" + embedding_config = OllamaEmbeddingConfig( + model="nomic-embed-text", + dimensions=100, + base_url="http://localhost:8000", + truncate=False, + keep_alive="1h", + ) + + setup_migrations( + alembic_config, initialized_engine, embedding_config=embedding_config + ) + upgrade(alembic_config, "head") + + compare_db_vectorizer_with_configs( + initialized_engine, embedding_config=embedding_config + ) + + +def test_vectorizer_migration_chunking_recursive( + alembic_config: Config, initialized_engine: Engine +): + """Test vectorizer creation with recursive character text splitter configuration""" + chunking_config = ChunkingConfig( + chunk_column="content", + chunk_size=1000, + chunk_overlap=100, + separator=["\n\n", "\n", ".", "!", "?"], + is_separator_regex=True, + ) + + setup_migrations( + alembic_config, initialized_engine, chunking_config=chunking_config + ) + upgrade(alembic_config, "head") + + compare_db_vectorizer_with_configs( + initialized_engine, + chunking_config=chunking_config, + ) + + +def test_vectorizer_migration_chunking_simple( + alembic_config: Config, initialized_engine: Engine +): + """Test vectorizer creation with simple character text splitter configuration""" + chunking_config = ChunkingConfig( + chunk_column="content", + chunk_size=200, # Small chunk size + chunk_overlap=50, + separator=" ", # Single simple separator + is_separator_regex=False, + ) + + setup_migrations( + alembic_config, initialized_engine, chunking_config=chunking_config + ) + upgrade(alembic_config, "head") + compare_db_vectorizer_with_configs( + initialized_engine, chunking_config=chunking_config + ) + + +def test_vectorizer_migration_chunking_regex( + alembic_config: Config, initialized_engine: Engine +): + """Test vectorizer creation with regex separator configuration""" + chunking_config = ChunkingConfig( + chunk_column="content", + chunk_size=800, + chunk_overlap=200, + separator=r"\s+", # Regex for whitespace + is_separator_regex=True, + ) + + setup_migrations( + alembic_config, initialized_engine, chunking_config=chunking_config + ) + upgrade(alembic_config, "head") + compare_db_vectorizer_with_configs( + initialized_engine, chunking_config=chunking_config + ) + + +def test_vectorizer_migration_hnsw_cosine( + alembic_config: Config, initialized_engine: Engine +): + """Test vectorizer creation with HNSW cosine indexing""" + indexing_config = HNSWIndexingConfig( + min_rows=50000, + opclass="vector_cosine_ops", + m=16, + ef_construction=64, + create_when_queue_empty=True, + ) + + setup_migrations( + alembic_config, initialized_engine, indexing_config=indexing_config + ) + upgrade(alembic_config, "head") + compare_db_vectorizer_with_configs( + initialized_engine, indexing_config=indexing_config + ) + + +def test_vectorizer_migration_hnsw_l1( + alembic_config: Config, initialized_engine: Engine +): + """Test vectorizer creation with HNSW L1 indexing""" + indexing_config = HNSWIndexingConfig( + min_rows=75000, + opclass="vector_l1_ops", + m=32, + ef_construction=128, + create_when_queue_empty=False, + ) + + setup_migrations( + alembic_config, initialized_engine, indexing_config=indexing_config + ) + upgrade(alembic_config, "head") + compare_db_vectorizer_with_configs( + initialized_engine, indexing_config=indexing_config + ) + + +def test_vectorizer_migration_diskann_minimal( + alembic_config: Config, initialized_engine: Engine +): + indexing_config = DiskANNIndexingConfig() + + setup_migrations( + alembic_config, initialized_engine, indexing_config=indexing_config + ) + upgrade(alembic_config, "head") + compare_db_vectorizer_with_configs( + initialized_engine, indexing_config=indexing_config + ) + + +def test_vectorizer_migration_no_scheduling( + alembic_config: Config, initialized_engine: Engine +): + """Test vectorizer creation with no scheduling""" + scheduling_config = NoSchedulingConfig() + indexing_config = NoIndexingConfig() + + setup_migrations( + alembic_config, + initialized_engine, + scheduling_config=scheduling_config, + indexing_config=indexing_config, + ) + upgrade(alembic_config, "head") + compare_db_vectorizer_with_configs( + initialized_engine, + scheduling_config=scheduling_config, + indexing_config=indexing_config, + ) + + +def test_vectorizer_migration_custom_schedule( + alembic_config: Config, initialized_engine: Engine +): + """Test vectorizer creation with custom scheduling""" + scheduling_config = SchedulingConfig( + schedule_interval="30m", + initial_start="2024-03-20T00:00:00Z", + fixed_schedule=True, + timezone="America/New_York", + ) + + setup_migrations( + alembic_config, initialized_engine, scheduling_config=scheduling_config + ) + upgrade(alembic_config, "head") + compare_db_vectorizer_with_configs( + initialized_engine, scheduling_config=scheduling_config + ) + + +def test_vectorizer_migration_flexible_schedule( + alembic_config: Config, initialized_engine: Engine +): + """Test vectorizer creation with flexible scheduling""" + scheduling_config = SchedulingConfig( + schedule_interval="2h", fixed_schedule=False, timezone="UTC" + ) + + setup_migrations( + alembic_config, initialized_engine, scheduling_config=scheduling_config + ) + upgrade(alembic_config, "head") + compare_db_vectorizer_with_configs( + initialized_engine, scheduling_config=scheduling_config + ) diff --git a/projects/pgai/tests/vectorizer/extensions/test_sqlalchemy.py b/projects/pgai/tests/vectorizer/extensions/test_sqlalchemy.py index ce3adb113..408ac20e5 100644 --- a/projects/pgai/tests/vectorizer/extensions/test_sqlalchemy.py +++ b/projects/pgai/tests/vectorizer/extensions/test_sqlalchemy.py @@ -25,7 +25,9 @@ class BlogPost(Base): content = Column(Text, nullable=False) content_embeddings = VectorizerField( - embedding=OpenAIEmbeddingConfig(model="text-embedding-3-small", dimensions=768), + embedding=OpenAIEmbeddingConfig( + model="text-embedding-3-small", dimensions=768 + ), chunking=ChunkingConfig( chunk_column="content", chunk_size=500, chunk_overlap=50 ),