diff --git a/.github/workflows/test_destination_lancedb.yml b/.github/workflows/test_destination_lancedb.yml new file mode 100644 index 0000000000..02b5ef66eb --- /dev/null +++ b/.github/workflows/test_destination_lancedb.yml @@ -0,0 +1,81 @@ +name: dest | lancedb + +on: + pull_request: + branches: + - master + - devel + workflow_dispatch: + schedule: + - cron: '0 2 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} + + RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 + RUNTIME__LOG_LEVEL: ERROR + RUNTIME__DLTHUB_TELEMETRY_ENDPOINT: ${{ secrets.RUNTIME__DLTHUB_TELEMETRY_ENDPOINT }} + + ACTIVE_DESTINATIONS: "[\"lancedb\"]" + ALL_FILESYSTEM_DRIVERS: "[\"memory\"]" + +jobs: + get_docs_changes: + name: docs changes + uses: ./.github/workflows/get_docs_changes.yml + if: ${{ !github.event.pull_request.head.repo.fork || contains(github.event.pull_request.labels.*.name, 'ci from fork')}} + + run_loader: + name: dest | lancedb tests + needs: get_docs_changes + if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' + defaults: + run: + shell: bash + runs-on: "ubuntu-latest" + + steps: + - name: Check out + uses: actions/checkout@master + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: "3.11.x" + + - name: Install Poetry + uses: snok/install-poetry@v1.3.2 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp + + - name: create secrets.toml + run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml + + - name: Install dependencies + run: poetry install --no-interaction -E lancedb -E parquet --with sentry-sdk --with pipeline + + - name: Install embedding provider dependencies + run: poetry run pip install openai + + - run: | + poetry run pytest tests/load -m "essential" + name: Run essential tests Linux + if: ${{ ! 
(contains(github.event.pull_request.labels.*.name, 'ci full') || github.event_name == 'schedule')}} + + - run: | + poetry run pytest tests/load + name: Run all tests Linux + if: ${{ contains(github.event.pull_request.labels.*.name, 'ci full') || github.event_name == 'schedule'}} diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index cb6417a4ab..2c51695714 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -61,7 +61,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres --with docs,sentry-sdk --without airflow + run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres -E lancedb --with docs,sentry-sdk --without airflow - name: create secrets.toml for examples run: pwd && echo "$DLT_SECRETS_TOML" > docs/examples/.dlt/secrets.toml diff --git a/dlt/destinations/__init__.py b/dlt/destinations/__init__.py index 302de24a6b..0546d16bcd 100644 --- a/dlt/destinations/__init__.py +++ b/dlt/destinations/__init__.py @@ -8,6 +8,7 @@ from dlt.destinations.impl.athena.factory import athena from dlt.destinations.impl.redshift.factory import redshift from dlt.destinations.impl.qdrant.factory import qdrant +from dlt.destinations.impl.lancedb.factory import lancedb from dlt.destinations.impl.motherduck.factory import motherduck from dlt.destinations.impl.weaviate.factory import weaviate from dlt.destinations.impl.destination.factory import destination @@ -28,6 +29,7 @@ "athena", "redshift", "qdrant", + "lancedb", "motherduck", "weaviate", "synapse", diff --git a/dlt/destinations/adapters.py b/dlt/destinations/adapters.py index 42d4879653..0cf04b7b59 100644 --- a/dlt/destinations/adapters.py +++ b/dlt/destinations/adapters.py @@ -2,6 +2,7 @@ from dlt.destinations.impl.weaviate.weaviate_adapter import weaviate_adapter from dlt.destinations.impl.qdrant.qdrant_adapter import qdrant_adapter +from dlt.destinations.impl.lancedb import lancedb_adapter from dlt.destinations.impl.bigquery.bigquery_adapter import bigquery_adapter from dlt.destinations.impl.synapse.synapse_adapter import synapse_adapter from dlt.destinations.impl.clickhouse.clickhouse_adapter import clickhouse_adapter @@ -10,6 +11,7 @@ __all__ = [ "weaviate_adapter", "qdrant_adapter", + "lancedb_adapter", "bigquery_adapter", "synapse_adapter", "clickhouse_adapter", diff --git a/dlt/destinations/impl/lancedb/__init__.py b/dlt/destinations/impl/lancedb/__init__.py new file mode 100644 index 0000000000..bc6974b072 --- /dev/null +++ b/dlt/destinations/impl/lancedb/__init__.py @@ -0,0 +1 @@ +from dlt.destinations.impl.lancedb.lancedb_adapter import lancedb_adapter diff --git a/dlt/destinations/impl/lancedb/configuration.py b/dlt/destinations/impl/lancedb/configuration.py new file mode 100644 index 0000000000..ba3a8b49d9 --- /dev/null +++ b/dlt/destinations/impl/lancedb/configuration.py @@ -0,0 +1,111 @@ +import dataclasses +from typing import Optional, Final, Literal, ClassVar, List + +from dlt.common.configuration import configspec +from dlt.common.configuration.specs.base_configuration import ( + BaseConfiguration, + CredentialsConfiguration, +) +from dlt.common.destination.reference import DestinationClientDwhConfiguration +from dlt.common.typing import TSecretStrValue +from dlt.common.utils import digest128 + + +@configspec +class 
LanceDBCredentials(CredentialsConfiguration): + uri: Optional[str] = ".lancedb" + """LanceDB database URI. Defaults to a local, on-disk instance. + + The available URI schemes are: + + - `/path/to/database` - local database. + - `db://host:port` - remote database (LanceDB cloud). + """ + api_key: Optional[TSecretStrValue] = None + """API key for remote connections (LanceDB cloud).""" + embedding_model_provider_api_key: Optional[str] = None + """API key for the embedding model provider.""" + + __config_gen_annotations__: ClassVar[List[str]] = [ + "uri", + "api_key", + "embedding_model_provider_api_key", + ] + + +@configspec +class LanceDBClientOptions(BaseConfiguration): + max_retries: Optional[int] = 3 + """The `EmbeddingFunction` class wraps the calls for source and query embedding + generation inside a rate limit handler that retries the requests with exponential + backoff after successive failures. + + You can tune it by setting it to a different number, or disable it by setting it to 0.""" + + __config_gen_annotations__: ClassVar[List[str]] = [ + "max_retries", + ] + + +TEmbeddingProvider = Literal[ + "gemini-text", + "bedrock-text", + "cohere", + "gte-text", + "imagebind", + "instructor", + "open-clip", + "openai", + "sentence-transformers", + "huggingface", + "colbert", +] + + +@configspec +class LanceDBClientConfiguration(DestinationClientDwhConfiguration): + destination_type: Final[str] = dataclasses.field( # type: ignore + default="LanceDB", init=False, repr=False, compare=False + ) + credentials: LanceDBCredentials = None + dataset_separator: str = "___" + """Character used as the dataset separator.""" + dataset_name: Final[Optional[str]] = dataclasses.field( # type: ignore + default=None, init=False, repr=False, compare=False + ) + + options: Optional[LanceDBClientOptions] = None + """LanceDB client options.""" + + embedding_model_provider: TEmbeddingProvider = "cohere" + """Embedding provider used for generating embeddings. Default is "cohere". You can find the full list of + providers at https://github.com/lancedb/lancedb/tree/main/python/python/lancedb/embeddings as well as + https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/.""" + embedding_model: str = "embed-english-v3.0" + """The model used by the embedding provider for generating embeddings. + Check with the embedding provider which options are available. + Reference https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/.""" + embedding_model_dimensions: Optional[int] = None + """The dimensions of the embeddings generated. In most cases this is inferred automatically by LanceDB, + but it is configurable in rare cases. + + Make sure it corresponds with the associated embedding model's dimensionality.""" + vector_field_name: str = "vector__" + """Name of the special field to store the vector embeddings.""" + id_field_name: str = "id__" + """Name of the special field to manage deduplication.""" + sentinel_table_name: str = "dltSentinelTable" + """Name of the sentinel table that encapsulates datasets. 
Since LanceDB has no + concept of schemas, this table serves as a proxy to group related dlt tables together.""" + + __config_gen_annotations__: ClassVar[List[str]] = [ + "embedding_model", + "embedding_model_provider", + ] + + def fingerprint(self) -> str: + """Returns a fingerprint of a connection string.""" + + if self.credentials and self.credentials.uri: + return digest128(self.credentials.uri) + return "" diff --git a/dlt/destinations/impl/lancedb/exceptions.py b/dlt/destinations/impl/lancedb/exceptions.py new file mode 100644 index 0000000000..35b86ce76c --- /dev/null +++ b/dlt/destinations/impl/lancedb/exceptions.py @@ -0,0 +1,30 @@ +from functools import wraps +from typing import ( + Any, +) + +from lancedb.exceptions import MissingValueError, MissingColumnError # type: ignore + +from dlt.common.destination.exceptions import ( + DestinationUndefinedEntity, + DestinationTerminalException, +) +from dlt.common.destination.reference import JobClientBase +from dlt.common.typing import TFun + + +def lancedb_error(f: TFun) -> TFun: + @wraps(f) + def _wrap(self: JobClientBase, *args: Any, **kwargs: Any) -> Any: + try: + return f(self, *args, **kwargs) + except ( + FileNotFoundError, + MissingValueError, + MissingColumnError, + ) as status_ex: + raise DestinationUndefinedEntity(status_ex) from status_ex + except Exception as e: + raise DestinationTerminalException(e) from e + + return _wrap # type: ignore[return-value] diff --git a/dlt/destinations/impl/lancedb/factory.py b/dlt/destinations/impl/lancedb/factory.py new file mode 100644 index 0000000000..f2e17168b9 --- /dev/null +++ b/dlt/destinations/impl/lancedb/factory.py @@ -0,0 +1,53 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.destinations.impl.lancedb.configuration import ( + LanceDBCredentials, + LanceDBClientConfiguration, +) + + +if t.TYPE_CHECKING: + from dlt.destinations.impl.lancedb.lancedb_client import LanceDBClient + + +class lancedb(Destination[LanceDBClientConfiguration, "LanceDBClient"]): + spec = LanceDBClientConfiguration + + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "jsonl" + caps.supported_loader_file_formats = ["jsonl"] + + caps.max_identifier_length = 200 + caps.max_column_identifier_length = 1024 + caps.max_query_length = 8 * 1024 * 1024 + caps.is_max_query_length_in_bytes = False + caps.max_text_data_type_length = 8 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = False + caps.supports_ddl_transactions = False + + caps.decimal_precision = (38, 18) + caps.timestamp_precision = 6 + + return caps + + @property + def client_class(self) -> t.Type["LanceDBClient"]: + from dlt.destinations.impl.lancedb.lancedb_client import LanceDBClient + + return LanceDBClient + + def __init__( + self, + credentials: t.Union[LanceDBCredentials, t.Dict[str, t.Any]] = None, + destination_name: t.Optional[str] = None, + environment: t.Optional[str] = None, + **kwargs: t.Any, + ) -> None: + super().__init__( + credentials=credentials, + destination_name=destination_name, + environment=environment, + **kwargs, + ) diff --git a/dlt/destinations/impl/lancedb/lancedb_adapter.py b/dlt/destinations/impl/lancedb/lancedb_adapter.py new file mode 100644 index 0000000000..bb33632b48 --- /dev/null +++ b/dlt/destinations/impl/lancedb/lancedb_adapter.py @@ -0,0 +1,58 @@ +from typing import Any + +from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns +from 
dlt.destinations.utils import ensure_resource +from dlt.extract import DltResource + + +VECTORIZE_HINT = "x-lancedb-embed" + + +def lancedb_adapter( + data: Any, + embed: TColumnNames = None, +) -> DltResource: + """Prepares data for the LanceDB destination by specifying which columns should be embedded. + + Args: + data (Any): The data to be transformed. It can be raw data or an instance + of DltResource. If raw data, the function wraps it into a DltResource + object. + embed (TColumnNames, optional): Specify columns to generate embeddings for. + It can be a single column name as a string, or a list of column names. + + Returns: + DltResource: A resource with applied LanceDB-specific hints. + + Raises: + ValueError: If the input for `embed` is invalid or empty. + + Examples: + >>> data = [{"name": "Marcel", "description": "Moonbase Engineer"}] + >>> lancedb_adapter(data, embed="description") + [DltResource with hints applied] + """ + resource = ensure_resource(data) + + column_hints: TTableSchemaColumns = {} + + if embed: + if isinstance(embed, str): + embed = [embed] + if not isinstance(embed, list): + raise ValueError( + "'embed' must be a list of column names or a single column name as a string." + ) + + for column_name in embed: + column_hints[column_name] = { + "name": column_name, + VECTORIZE_HINT: True, # type: ignore[misc] + } + + if not column_hints: + raise ValueError("A value for 'embed' must be specified.") + else: + resource.apply_hints(columns=column_hints) + + return resource diff --git a/dlt/destinations/impl/lancedb/lancedb_client.py b/dlt/destinations/impl/lancedb/lancedb_client.py new file mode 100644 index 0000000000..128e2c7e7e --- /dev/null +++ b/dlt/destinations/impl/lancedb/lancedb_client.py @@ -0,0 +1,767 @@ +import uuid +from types import TracebackType +from typing import ( + ClassVar, + List, + Any, + cast, + Union, + Tuple, + Iterable, + Type, + Optional, + Dict, + Sequence, +) + +import lancedb # type: ignore +import pyarrow as pa +from lancedb import DBConnection +from lancedb.embeddings import EmbeddingFunctionRegistry, TextEmbeddingFunction # type: ignore +from lancedb.query import LanceQueryBuilder # type: ignore +from lancedb.table import Table # type: ignore +from numpy import ndarray +from pyarrow import Array, ChunkedArray, ArrowInvalid + +from dlt.common import json, pendulum, logger +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.destination.exceptions import ( + DestinationUndefinedEntity, + DestinationTransientException, + DestinationTerminalException, +) +from dlt.common.destination.reference import ( + JobClientBase, + WithStateSync, + LoadJob, + StorageSchemaInfo, + StateInfo, + TLoadJobState, +) +from dlt.common.pendulum import timedelta +from dlt.common.schema import Schema, TTableSchema, TSchemaTables +from dlt.common.schema.typing import ( + TColumnType, + TTableFormat, + TTableSchemaColumns, + TWriteDisposition, +) +from dlt.common.schema.utils import get_columns_names_with_prop +from dlt.common.storages import FileStorage +from dlt.common.typing import DictStrAny +from dlt.destinations.impl.lancedb.configuration import ( + LanceDBClientConfiguration, +) +from dlt.destinations.impl.lancedb.exceptions import ( + lancedb_error, +) +from dlt.destinations.impl.lancedb.lancedb_adapter import VECTORIZE_HINT +from dlt.destinations.impl.lancedb.schema import ( + make_arrow_field_schema, + make_arrow_table_schema, + TArrowSchema, + NULL_SCHEMA, + TArrowField, +) +from dlt.destinations.impl.lancedb.utils import ( + 
list_merge_identifiers, + generate_uuid, + set_non_standard_providers_environment_variables, +) +from dlt.destinations.job_impl import EmptyLoadJob +from dlt.destinations.type_mapping import TypeMapper + + +TIMESTAMP_PRECISION_TO_UNIT: Dict[int, str] = {0: "s", 3: "ms", 6: "us", 9: "ns"} +UNIT_TO_TIMESTAMP_PRECISION: Dict[str, int] = {v: k for k, v in TIMESTAMP_PRECISION_TO_UNIT.items()} + + +class LanceDBTypeMapper(TypeMapper): + sct_to_unbound_dbt = { + "text": pa.string(), + "double": pa.float64(), + "bool": pa.bool_(), + "bigint": pa.int64(), + "binary": pa.binary(), + "date": pa.date32(), + "complex": pa.string(), + } + + sct_to_dbt = {} + + dbt_to_sct = { + pa.string(): "text", + pa.float64(): "double", + pa.bool_(): "bool", + pa.int64(): "bigint", + pa.binary(): "binary", + pa.date32(): "date", + } + + def to_db_decimal_type( + self, precision: Optional[int], scale: Optional[int] + ) -> pa.Decimal128Type: + precision, scale = self.decimal_precision(precision, scale) + return pa.decimal128(precision, scale) + + def to_db_datetime_type( + self, precision: Optional[int], table_format: TTableFormat = None + ) -> pa.TimestampType: + unit: str = TIMESTAMP_PRECISION_TO_UNIT[self.capabilities.timestamp_precision] + return pa.timestamp(unit, "UTC") + + def to_db_time_type( + self, precision: Optional[int], table_format: TTableFormat = None + ) -> pa.Time64Type: + unit: str = TIMESTAMP_PRECISION_TO_UNIT[self.capabilities.timestamp_precision] + return pa.time64(unit) + + def from_db_type( + self, + db_type: pa.DataType, + precision: Optional[int] = None, + scale: Optional[int] = None, + ) -> TColumnType: + if isinstance(db_type, pa.TimestampType): + return dict( + data_type="timestamp", + precision=UNIT_TO_TIMESTAMP_PRECISION[db_type.unit], + scale=scale, + ) + if isinstance(db_type, pa.Time64Type): + return dict( + data_type="time", + precision=UNIT_TO_TIMESTAMP_PRECISION[db_type.unit], + scale=scale, + ) + if isinstance(db_type, pa.Decimal128Type): + precision, scale = db_type.precision, db_type.scale + if (precision, scale) == self.capabilities.wei_precision: + return cast(TColumnType, dict(data_type="wei")) + return dict(data_type="decimal", precision=precision, scale=scale) + return super().from_db_type(db_type, precision, scale) + + +def upload_batch( + records: List[DictStrAny], + /, + *, + db_client: DBConnection, + table_name: str, + write_disposition: TWriteDisposition, + id_field_name: Optional[str] = None, +) -> None: + """Inserts records into a LanceDB table with automatic embedding computation. + + Args: + records: The data to be inserted as payload. + db_client: The LanceDB client connection. + table_name: The name of the table to insert into. + id_field_name: The name of the ID field for update/merge operations. + write_disposition: The write disposition - one of 'skip', 'append', 'replace', 'merge'. + + Raises: + ValueError: If the write disposition is unsupported, or `id_field_name` is not + provided for update/merge operations. + """ + + try: + tbl = db_client.open_table(table_name) + tbl.checkout_latest() + except FileNotFoundError as e: + raise DestinationTransientException( + "Couldn't open lancedb database. 
Batch WILL BE RETRIED" + ) from e + + try: + if write_disposition in ("append", "skip"): + tbl.add(records) + elif write_disposition == "replace": + tbl.add(records, mode="overwrite") + elif write_disposition == "merge": + if not id_field_name: + raise ValueError("To perform a merge update, 'id_field_name' must be specified.") + tbl.merge_insert( + id_field_name + ).when_matched_update_all().when_not_matched_insert_all().execute(records) + else: + raise DestinationTerminalException( + f"Unsupported write disposition {write_disposition} for LanceDB Destination - batch" + " failed AND WILL **NOT** BE RETRIED." + ) + except ArrowInvalid as e: + raise DestinationTerminalException( + "Python and Arrow datatype mismatch - batch failed AND WILL **NOT** BE RETRIED." + ) from e + + +class LanceDBClient(JobClientBase, WithStateSync): + """LanceDB destination handler.""" + + model_func: TextEmbeddingFunction + + def __init__( + self, + schema: Schema, + config: LanceDBClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(schema, config, capabilities) + self.config: LanceDBClientConfiguration = config + self.db_client: DBConnection = lancedb.connect( + uri=self.config.credentials.uri, + api_key=self.config.credentials.api_key, + read_consistency_interval=timedelta(0), + ) + self.registry = EmbeddingFunctionRegistry.get_instance() + self.type_mapper = LanceDBTypeMapper(self.capabilities) + self.sentinel_table_name = config.sentinel_table_name + + embedding_model_provider = self.config.embedding_model_provider + + # LanceDB doesn't provide a standardized way to set API keys across providers. + # Some use ENV variables and others allow passing api key as an argument. + # To account for this, we set provider environment variable as well. + set_non_standard_providers_environment_variables( + embedding_model_provider, + self.config.credentials.embedding_model_provider_api_key, + ) + # Use the monkey-patched implementation if openai was chosen. + if embedding_model_provider == "openai": + from dlt.destinations.impl.lancedb.models import PatchedOpenAIEmbeddings + + self.model_func = PatchedOpenAIEmbeddings( + max_retries=self.config.options.max_retries, + api_key=self.config.credentials.api_key, + ) + else: + self.model_func = self.registry.get(embedding_model_provider).create( + name=self.config.embedding_model, + max_retries=self.config.options.max_retries, + api_key=self.config.credentials.api_key, + ) + + self.vector_field_name = self.config.vector_field_name + self.id_field_name = self.config.id_field_name + + @property + def dataset_name(self) -> str: + return self.config.normalize_dataset_name(self.schema) + + @property + def sentinel_table(self) -> str: + return self.make_qualified_table_name(self.sentinel_table_name) + + def make_qualified_table_name(self, table_name: str) -> str: + return ( + f"{self.dataset_name}{self.config.dataset_separator}{table_name}" + if self.dataset_name + else table_name + ) + + def get_table_schema(self, table_name: str) -> TArrowSchema: + schema_table: Table = self.db_client.open_table(table_name) + schema_table.checkout_latest() + schema = schema_table.schema + return cast( + TArrowSchema, + schema, + ) + + @lancedb_error + def create_table(self, table_name: str, schema: TArrowSchema, mode: str = "create") -> Table: + """Create a LanceDB Table from the provided LanceModel or PyArrow schema. + + Args: + schema: The table schema to create. + table_name: The name of the table to create. 
+ mode: The mode to use when creating the table. Can be either "create" or "overwrite". + By default, if the table already exists, an exception is raised. + If you want to overwrite the table, use mode="overwrite". + """ + return self.db_client.create_table(table_name, schema=schema, mode=mode) + + def delete_table(self, table_name: str) -> None: + """Delete a LanceDB table. + + Args: + table_name: The name of the table to delete. + """ + self.db_client.drop_table(table_name) + + def query_table( + self, + table_name: str, + query: Union[ + List[Any], ndarray[Any, Any], Array, ChunkedArray, str, Tuple[Any], None + ] = None, + ) -> LanceQueryBuilder: + """Query a LanceDB table. + + Args: + table_name: The name of the table to query. + query: The targeted vector to search for. + + Returns: + A LanceDB query builder. + """ + query_table: Table = self.db_client.open_table(table_name) + query_table.checkout_latest() + return query_table.search(query=query) + + @lancedb_error + def _get_table_names(self) -> List[str]: + """Return all tables in the dataset, excluding the sentinel table.""" + if self.dataset_name: + prefix = f"{self.dataset_name}{self.config.dataset_separator}" + table_names = [ + table_name + for table_name in self.db_client.table_names() + if table_name.startswith(prefix) + ] + else: + table_names = self.db_client.table_names() + + return [table_name for table_name in table_names if table_name != self.sentinel_table] + + @lancedb_error + def drop_storage(self) -> None: + """Drop the dataset from the LanceDB instance. + + Deletes all tables in the dataset and all data, as well as the sentinel table associated with them. + + If the dataset name was not provided, it deletes all the tables in the current schema. + """ + for table_name in self._get_table_names(): + self.db_client.drop_table(table_name) + + self._delete_sentinel_table() + + @lancedb_error + def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: + if not self.is_storage_initialized(): + self._create_sentinel_table() + elif truncate_tables: + for table_name in truncate_tables: + fq_table_name = self.make_qualified_table_name(table_name) + if not self.table_exists(fq_table_name): + continue + schema = self.get_table_schema(fq_table_name) + self.db_client.drop_table(fq_table_name) + self.create_table( + table_name=fq_table_name, + schema=schema, + ) + + @lancedb_error + def is_storage_initialized(self) -> bool: + return self.table_exists(self.sentinel_table) + + def _create_sentinel_table(self) -> Table: + """Create an empty table to indicate that the storage is initialized.""" + return self.create_table(schema=NULL_SCHEMA, table_name=self.sentinel_table) + + def _delete_sentinel_table(self) -> None: + """Delete the sentinel table.""" + self.db_client.drop_table(self.sentinel_table) + + @lancedb_error + def update_stored_schema( + self, + only_tables: Iterable[str] = None, + expected_update: TSchemaTables = None, + ) -> Optional[TSchemaTables]: + super().update_stored_schema(only_tables, expected_update) + applied_update: TSchemaTables = {} + + try: + schema_info = self.get_stored_schema_by_hash(self.schema.stored_version_hash) + except DestinationUndefinedEntity: + schema_info = None + + if schema_info is None: + logger.info( + f"Schema with hash {self.schema.stored_version_hash} " + "not found in the storage, 
upgrading" + ) + self._execute_schema_update(only_tables) + else: + logger.info( + f"Schema with hash {self.schema.stored_version_hash} " + f"inserted at {schema_info.inserted_at} found " + "in storage, no upgrade required" + ) + return applied_update + + def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: + table_schema: TTableSchemaColumns = {} + + try: + fq_table_name = self.make_qualified_table_name(table_name) + + table: Table = self.db_client.open_table(fq_table_name) + table.checkout_latest() + arrow_schema: TArrowSchema = table.schema + except FileNotFoundError: + return False, table_schema + + field: TArrowField + for field in arrow_schema: + name = self.schema.naming.normalize_identifier(field.name) + print(field.type) + print(field.name) + table_schema[name] = { + "name": name, + **self.type_mapper.from_db_type(field.type), + } + return True, table_schema + + @lancedb_error + def add_table_fields( + self, table_name: str, field_schemas: List[TArrowField] + ) -> Optional[Table]: + """Add multiple fields to the LanceDB table at once. + + Args: + table_name: The name of the table to create the fields on. + field_schemas: The list of fields to create. + """ + table: Table = self.db_client.open_table(table_name) + table.checkout_latest() + arrow_table = table.to_arrow() + + # Check if any of the new fields already exist in the table. + existing_fields = set(arrow_table.schema.names) + new_fields = [field for field in field_schemas if field.name not in existing_fields] + + if not new_fields: + # All fields already present, skip. + return None + + null_arrays = [pa.nulls(len(arrow_table), type=field.type) for field in new_fields] + + for field, null_array in zip(new_fields, null_arrays): + arrow_table = arrow_table.append_column(field, null_array) + + try: + return self.db_client.create_table(table_name, arrow_table, mode="overwrite") + except OSError: + # Error occurred while creating the table, skip. 
+ return None + + def _execute_schema_update(self, only_tables: Iterable[str]) -> None: + for table_name in only_tables or self.schema.tables: + exists, existing_columns = self.get_storage_table(table_name) + new_columns = self.schema.get_new_table_columns(table_name, existing_columns) + embedding_fields: List[str] = get_columns_names_with_prop( + self.schema.get_table(table_name), VECTORIZE_HINT + ) + logger.info(f"Found {len(new_columns)} updates for {table_name} in {self.schema.name}") + if len(new_columns) > 0: + if exists: + field_schemas: List[TArrowField] = [ + make_arrow_field_schema(column["name"], column, self.type_mapper) + for column in new_columns + ] + fq_table_name = self.make_qualified_table_name(table_name) + self.add_table_fields(fq_table_name, field_schemas) + else: + if table_name not in self.schema.dlt_table_names(): + embedding_fields = get_columns_names_with_prop( + self.schema.get_table(table_name=table_name), VECTORIZE_HINT + ) + vector_field_name = self.vector_field_name + id_field_name = self.id_field_name + embedding_model_func = self.model_func + embedding_model_dimensions = self.config.embedding_model_dimensions + else: + embedding_fields = None + vector_field_name = None + id_field_name = None + embedding_model_func = None + embedding_model_dimensions = None + + table_schema: TArrowSchema = make_arrow_table_schema( + table_name, + schema=self.schema, + type_mapper=self.type_mapper, + embedding_fields=embedding_fields, + embedding_model_func=embedding_model_func, + embedding_model_dimensions=embedding_model_dimensions, + vector_field_name=vector_field_name, + id_field_name=id_field_name, + ) + fq_table_name = self.make_qualified_table_name(table_name) + self.create_table(fq_table_name, table_schema) + + self.update_schema_in_storage() + + @lancedb_error + def update_schema_in_storage(self) -> None: + records = [ + { + self.schema.naming.normalize_identifier("version"): self.schema.version, + self.schema.naming.normalize_identifier( + "engine_version" + ): self.schema.ENGINE_VERSION, + self.schema.naming.normalize_identifier("inserted_at"): str(pendulum.now()), + self.schema.naming.normalize_identifier("schema_name"): self.schema.name, + self.schema.naming.normalize_identifier( + "version_hash" + ): self.schema.stored_version_hash, + self.schema.naming.normalize_identifier("schema"): json.dumps( + self.schema.to_dict() + ), + } + ] + fq_version_table_name = self.make_qualified_table_name(self.schema.version_table_name) + write_disposition = self.schema.get_table(self.schema.version_table_name).get( + "write_disposition" + ) + upload_batch( + records, + db_client=self.db_client, + table_name=fq_version_table_name, + write_disposition=write_disposition, + ) + + @lancedb_error + def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: + """Retrieves the latest completed state for a pipeline.""" + fq_state_table_name = self.make_qualified_table_name(self.schema.state_table_name) + fq_loads_table_name = self.make_qualified_table_name(self.schema.loads_table_name) + + state_table_: Table = self.db_client.open_table(fq_state_table_name) + state_table_.checkout_latest() + + loads_table_: Table = self.db_client.open_table(fq_loads_table_name) + loads_table_.checkout_latest() + + # normalize property names + p_load_id = self.schema.naming.normalize_identifier("load_id") + p_dlt_load_id = self.schema.naming.normalize_identifier("_dlt_load_id") + p_pipeline_name = 
self.schema.naming.normalize_identifier("pipeline_name") + p_status = self.schema.naming.normalize_identifier("status") + p_version = self.schema.naming.normalize_identifier("version") + p_engine_version = self.schema.naming.normalize_identifier("engine_version") + p_state = self.schema.naming.normalize_identifier("state") + p_created_at = self.schema.naming.normalize_identifier("created_at") + p_version_hash = self.schema.naming.normalize_identifier("version_hash") + + # Read the tables into memory as Arrow tables, with pushdown predicates, so we pull as little + # data into memory as possible. + state_table = ( + state_table_.search() + .where(f"`{p_pipeline_name}` = '{pipeline_name}'", prefilter=True) + .to_arrow() + ) + loads_table = loads_table_.search().where(f"`{p_status}` = 0", prefilter=True).to_arrow() + + # Join arrow tables in-memory. + joined_table: pa.Table = state_table.join( + loads_table, keys=p_dlt_load_id, right_keys=p_load_id, join_type="inner" + ).sort_by([(p_dlt_load_id, "descending")]) + + if joined_table.num_rows == 0: + return None + + state = joined_table.take([0]).to_pylist()[0] + return StateInfo( + version=state[p_version], + engine_version=state[p_engine_version], + pipeline_name=state[p_pipeline_name], + state=state[p_state], + created_at=pendulum.instance(state[p_created_at]), + version_hash=state[p_version_hash], + _dlt_load_id=state[p_dlt_load_id], + ) + + @lancedb_error + def get_stored_schema_by_hash(self, schema_hash: str) -> Optional[StorageSchemaInfo]: + fq_version_table_name = self.make_qualified_table_name(self.schema.version_table_name) + + version_table: Table = self.db_client.open_table(fq_version_table_name) + version_table.checkout_latest() + p_version_hash = self.schema.naming.normalize_identifier("version_hash") + p_inserted_at = self.schema.naming.normalize_identifier("inserted_at") + p_schema_name = self.schema.naming.normalize_identifier("schema_name") + p_version = self.schema.naming.normalize_identifier("version") + p_engine_version = self.schema.naming.normalize_identifier("engine_version") + p_schema = self.schema.naming.normalize_identifier("schema") + + try: + schemas = ( + version_table.search().where( + f'`{p_version_hash}` = "{schema_hash}"', prefilter=True + ) + ).to_list() + + # LanceDB's ORDER BY clause doesn't seem to work. 
+ # See https://github.com/dlt-hub/dlt/pull/1375#issuecomment-2171909341 + most_recent_schema = sorted(schemas, key=lambda x: x[p_inserted_at], reverse=True)[0] + return StorageSchemaInfo( + version_hash=most_recent_schema[p_version_hash], + schema_name=most_recent_schema[p_schema_name], + version=most_recent_schema[p_version], + engine_version=most_recent_schema[p_engine_version], + inserted_at=most_recent_schema[p_inserted_at], + schema=most_recent_schema[p_schema], + ) + except IndexError: + return None + + @lancedb_error + def get_stored_schema(self) -> Optional[StorageSchemaInfo]: + """Retrieves newest schema from destination storage.""" + fq_version_table_name = self.make_qualified_table_name(self.schema.version_table_name) + + version_table: Table = self.db_client.open_table(fq_version_table_name) + version_table.checkout_latest() + p_version_hash = self.schema.naming.normalize_identifier("version_hash") + p_inserted_at = self.schema.naming.normalize_identifier("inserted_at") + p_schema_name = self.schema.naming.normalize_identifier("schema_name") + p_version = self.schema.naming.normalize_identifier("version") + p_engine_version = self.schema.naming.normalize_identifier("engine_version") + p_schema = self.schema.naming.normalize_identifier("schema") + + try: + schemas = ( + version_table.search().where( + f'`{p_schema_name}` = "{self.schema.name}"', prefilter=True + ) + ).to_list() + + # LanceDB's ORDER BY clause doesn't seem to work. + # See https://github.com/dlt-hub/dlt/pull/1375#issuecomment-2171909341 + most_recent_schema = sorted(schemas, key=lambda x: x[p_inserted_at], reverse=True)[0] + return StorageSchemaInfo( + version_hash=most_recent_schema[p_version_hash], + schema_name=most_recent_schema[p_schema_name], + version=most_recent_schema[p_version], + engine_version=most_recent_schema[p_engine_version], + inserted_at=most_recent_schema[p_inserted_at], + schema=most_recent_schema[p_schema], + ) + except IndexError: + return None + + def __exit__( + self, + exc_type: Type[BaseException], + exc_val: BaseException, + exc_tb: TracebackType, + ) -> None: + pass + + def __enter__(self) -> "LanceDBClient": + return self + + @lancedb_error + def complete_load(self, load_id: str) -> None: + records = [ + { + self.schema.naming.normalize_identifier("load_id"): load_id, + self.schema.naming.normalize_identifier("schema_name"): self.schema.name, + self.schema.naming.normalize_identifier("status"): 0, + self.schema.naming.normalize_identifier("inserted_at"): str(pendulum.now()), + self.schema.naming.normalize_identifier( + "schema_version_hash" + ): None, # Payload schema must match the target schema. 
+ } + ] + fq_loads_table_name = self.make_qualified_table_name(self.schema.loads_table_name) + write_disposition = self.schema.get_table(self.schema.loads_table_name).get( + "write_disposition" + ) + upload_batch( + records, + db_client=self.db_client, + table_name=fq_loads_table_name, + write_disposition=write_disposition, + ) + + def restore_file_load(self, file_path: str) -> LoadJob: + return EmptyLoadJob.from_file_path(file_path, "completed") + + def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: + return LoadLanceDBJob( + self.schema, + table, + file_path, + type_mapper=self.type_mapper, + db_client=self.db_client, + client_config=self.config, + model_func=self.model_func, + fq_table_name=self.make_qualified_table_name(table["name"]), + ) + + def table_exists(self, table_name: str) -> bool: + return table_name in self.db_client.table_names() + + +class LoadLanceDBJob(LoadJob): + arrow_schema: TArrowSchema + + def __init__( + self, + schema: Schema, + table_schema: TTableSchema, + local_path: str, + type_mapper: LanceDBTypeMapper, + db_client: DBConnection, + client_config: LanceDBClientConfiguration, + model_func: TextEmbeddingFunction, + fq_table_name: str, + ) -> None: + file_name = FileStorage.get_file_name_from_file_path(local_path) + super().__init__(file_name) + self.schema: Schema = schema + self.table_schema: TTableSchema = table_schema + self.db_client: DBConnection = db_client + self.type_mapper: TypeMapper = type_mapper + self.table_name: str = table_schema["name"] + self.fq_table_name: str = fq_table_name + self.unique_identifiers: Sequence[str] = list_merge_identifiers(table_schema) + self.embedding_fields: List[str] = get_columns_names_with_prop(table_schema, VECTORIZE_HINT) + self.embedding_model_func: TextEmbeddingFunction = model_func + self.embedding_model_dimensions: int = client_config.embedding_model_dimensions + self.id_field_name: str = client_config.id_field_name + self.write_disposition: TWriteDisposition = cast( + TWriteDisposition, self.table_schema.get("write_disposition", "append") + ) + + with FileStorage.open_zipsafe_ro(local_path) as f: + records: List[DictStrAny] = [json.loads(line) for line in f] + + if self.table_schema not in self.schema.dlt_tables(): + for record in records: + # Add reserved ID fields. + uuid_id = ( + generate_uuid(record, self.unique_identifiers, self.fq_table_name) + if self.unique_identifiers + else str(uuid.uuid4()) + ) + record.update({self.id_field_name: uuid_id}) + + # LanceDB expects all fields in the target arrow table to be present in the data payload. + # We add any fields missing from the record (present in the target schema but absent from the payload) and set them to NULL. 
+ missing_fields = set(self.table_schema["columns"]) - set(record) + for field in missing_fields: + record[field] = None + + upload_batch( + records, + db_client=db_client, + table_name=self.fq_table_name, + write_disposition=self.write_disposition, + id_field_name=self.id_field_name, + ) + + def state(self) -> TLoadJobState: + return "completed" + + def exception(self) -> str: + raise NotImplementedError() diff --git a/dlt/destinations/impl/lancedb/models.py b/dlt/destinations/impl/lancedb/models.py new file mode 100644 index 0000000000..d90adb62bd --- /dev/null +++ b/dlt/destinations/impl/lancedb/models.py @@ -0,0 +1,34 @@ +from typing import Union, List + +import numpy as np +from lancedb.embeddings import OpenAIEmbeddings # type: ignore +from lancedb.embeddings.registry import register # type: ignore +from lancedb.embeddings.utils import TEXT # type: ignore + + +@register("openai_patched") +class PatchedOpenAIEmbeddings(OpenAIEmbeddings): + EMPTY_STRING_PLACEHOLDER: str = "___EMPTY___" + + def sanitize_input(self, texts: TEXT) -> Union[List[str], np.ndarray]: # type: ignore[type-arg] + """ + Replace empty strings with a placeholder value. + """ + + sanitized_texts = super().sanitize_input(texts) + return [self.EMPTY_STRING_PLACEHOLDER if item == "" else item for item in sanitized_texts] + + def generate_embeddings( + self, + texts: Union[List[str], np.ndarray], # type: ignore[type-arg] + ) -> List[np.array]: # type: ignore[valid-type] + """ + Generate embeddings, treating the placeholder as an empty result. + """ + embeddings: List[np.array] = super().generate_embeddings(texts) # type: ignore[valid-type] + + for i, text in enumerate(texts): + if text == self.EMPTY_STRING_PLACEHOLDER: + embeddings[i] = np.zeros(self.ndims()) + + return embeddings diff --git a/dlt/destinations/impl/lancedb/schema.py b/dlt/destinations/impl/lancedb/schema.py new file mode 100644 index 0000000000..c7cceec274 --- /dev/null +++ b/dlt/destinations/impl/lancedb/schema.py @@ -0,0 +1,84 @@ +"""Utilities for creating arrow schemas from table schemas.""" + +from dlt.common.json import json +from typing import ( + List, + cast, + Optional, +) + +import pyarrow as pa +from lancedb.embeddings import TextEmbeddingFunction # type: ignore +from typing_extensions import TypeAlias + +from dlt.common.schema import Schema, TColumnSchema +from dlt.common.typing import DictStrAny +from dlt.destinations.type_mapping import TypeMapper + + +TArrowSchema: TypeAlias = pa.Schema +TArrowDataType: TypeAlias = pa.DataType +TArrowField: TypeAlias = pa.Field +NULL_SCHEMA: TArrowSchema = pa.schema([]) +"""Empty pyarrow Schema with no fields.""" + + +def arrow_schema_to_dict(schema: TArrowSchema) -> DictStrAny: + return {field.name: field.type for field in schema} + + +def make_arrow_field_schema( + column_name: str, + column: TColumnSchema, + type_mapper: TypeMapper, +) -> TArrowField: + """Creates a PyArrow field from a dlt column schema.""" + dtype = cast(TArrowDataType, type_mapper.to_db_type(column)) + return pa.field(column_name, dtype) + + +def make_arrow_table_schema( + table_name: str, + schema: Schema, + type_mapper: TypeMapper, + id_field_name: Optional[str] = None, + vector_field_name: Optional[str] = None, + embedding_fields: Optional[List[str]] = None, + embedding_model_func: Optional[TextEmbeddingFunction] = None, + embedding_model_dimensions: Optional[int] = None, +) -> TArrowSchema: + """Creates a PyArrow schema from a dlt schema.""" + arrow_schema: List[TArrowField] = [] + + if id_field_name: + 
arrow_schema.append(pa.field(id_field_name, pa.string())) + + if embedding_fields: + # The user-provided dimension config, if set, takes precedence. + vec_size = embedding_model_dimensions or embedding_model_func.ndims() + arrow_schema.append(pa.field(vector_field_name, pa.list_(pa.float32(), vec_size))) + + for column_name, column in schema.get_table_columns(table_name).items(): + field = make_arrow_field_schema(column_name, column, type_mapper) + arrow_schema.append(field) + + metadata = {} + if embedding_model_func: + # Get the registered alias if it exists, otherwise use the class name. + name = getattr( + embedding_model_func, + "__embedding_function_registry_alias__", + embedding_model_func.__class__.__name__, + ) + embedding_functions = [ + { + "source_column": source_column, + "vector_column": vector_field_name, + "name": name, + "model": embedding_model_func.safe_model_dump(), + } + for source_column in embedding_fields + ] + metadata["embedding_functions"] = json.dumps(embedding_functions).encode("utf-8") + + return pa.schema(arrow_schema, metadata=metadata) diff --git a/dlt/destinations/impl/lancedb/utils.py b/dlt/destinations/impl/lancedb/utils.py new file mode 100644 index 0000000000..aeacd4d34b --- /dev/null +++ b/dlt/destinations/impl/lancedb/utils.py @@ -0,0 +1,55 @@ +import os +import uuid +from typing import Sequence, Union, Dict + +from dlt.common.schema import TTableSchema +from dlt.common.schema.utils import get_columns_names_with_prop +from dlt.common.typing import DictStrAny +from dlt.destinations.impl.lancedb.configuration import TEmbeddingProvider + + +PROVIDER_ENVIRONMENT_VARIABLES_MAP: Dict[TEmbeddingProvider, str] = { + "cohere": "COHERE_API_KEY", + "gemini-text": "GOOGLE_API_KEY", + "openai": "OPENAI_API_KEY", + "huggingface": "HUGGINGFACE_API_KEY", +} + + +def generate_uuid(data: DictStrAny, unique_identifiers: Sequence[str], table_name: str) -> str: + """Generates a deterministic UUID - used for deduplication. + + Args: + data (Dict[str, Any]): Arbitrary data to generate UUID for. + unique_identifiers (Sequence[str]): A list of unique identifiers. + table_name (str): LanceDB table name. + + Returns: + str: A string representation of the generated UUID. + """ + data_id = "_".join(str(data[key]) for key in unique_identifiers) + return str(uuid.uuid5(uuid.NAMESPACE_DNS, table_name + data_id)) + + +def list_merge_identifiers(table_schema: TTableSchema) -> Sequence[str]: + """Returns a list of merge keys for a table used for either merging or deduplication. + + Args: + table_schema (TTableSchema): a dlt table schema. + + Returns: + Sequence[str]: A list of unique column identifiers. 
+ """ + if table_schema.get("write_disposition") == "merge": + primary_keys = get_columns_names_with_prop(table_schema, "primary_key") + merge_keys = get_columns_names_with_prop(table_schema, "merge_key") + if join_keys := list(set(primary_keys + merge_keys)): + return join_keys + return get_columns_names_with_prop(table_schema, "unique") + + +def set_non_standard_providers_environment_variables( + embedding_model_provider: TEmbeddingProvider, api_key: Union[str, None] +) -> None: + if embedding_model_provider in PROVIDER_ENVIRONMENT_VARIABLES_MAP: + os.environ[PROVIDER_ENVIRONMENT_VARIABLES_MAP[embedding_model_provider]] = api_key or "" diff --git a/docs/website/docs/dlt-ecosystem/destinations/lancedb.md b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md new file mode 100644 index 0000000000..dbf90da4b9 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md @@ -0,0 +1,211 @@ +--- +title: LanceDB +description: LanceDB is an open source vector database that can be used as a destination in dlt. +keywords: [ lancedb, vector database, destination, dlt ] +--- + +# LanceDB + +[LanceDB](https://lancedb.com/) is an open-source, high-performance vector database. It allows you to store data objects and perform similarity searches over them. +This destination helps you load data into LanceDB from [dlt resources](../../general-usage/resource.md). + +## Setup Guide + +### Choosing a Model Provider + +First, you need to decide which embedding model provider to use. You can find all supported providers by visiting the official [LanceDB docs](https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/). + + +### Install dlt with LanceDB + +To use LanceDB as a destination, make sure `dlt` is installed with the `lancedb` extra: + +```sh +pip install "dlt[lancedb]" +``` + +the lancedb extra only installs `dlt` and `lancedb`. You will need to install your model provider's SDK. + +You can find which libraries you need to also referring to the [LanceDB docs](https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/). + +### Configure the destination + +Configure the destination in the dlt secrets file located at `~/.dlt/secrets.toml` by default. Add the following section: + +```toml +[destination.lancedb] +embedding_model_provider = "cohere" +embedding_model = "embed-english-v3.0" +[destination.lancedb.credentials] +uri = ".lancedb" +api_key = "api_key" # API key to connect to LanceDB Cloud. Leave out if you are using LanceDB OSS. +embedding_model_provider_api_key = "embedding_model_provider_api_key" # Not needed for providers that don't need authentication (ollama, sentence-transformers). +``` + +- The `uri` specifies the location of your LanceDB instance. It defaults to a local, on-disk instance if not provided. +- The `api_key` is your api key for LanceDB Cloud connections. If you're using LanceDB OSS, you don't need to supply this key. +- The `embedding_model_provider` specifies the embedding provider used for generating embeddings. The default is `cohere`. +- The `embedding_model` specifies the model used by the embedding provider for generating embeddings. + Check with the embedding provider which options are available. + Reference https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/. +- The `embedding_model_provider_api_key` is the API key for the embedding model provider used to generate embeddings. If you're using a provider that doesn't need authentication, say ollama, you don't need to supply this key. 
+ +:::info Available Model Providers +- "gemini-text" +- "bedrock-text" +- "cohere" +- "gte-text" +- "imagebind" +- "instructor" +- "open-clip" +- "openai" +- "sentence-transformers" +- "huggingface" +- "colbert" +::: + +### Define your data source + +For example: + +```py +import dlt +from dlt.destinations.adapters import lancedb_adapter + + +movies = [ + { + "id": 1, + "title": "Blade Runner", + "year": 1982, + }, + { + "id": 2, + "title": "Ghost in the Shell", + "year": 1995, + }, + { + "id": 3, + "title": "The Matrix", + "year": 1999, + }, +] +``` + +### Create a pipeline: + +```py +pipeline = dlt.pipeline( + pipeline_name="movies", + destination="lancedb", + dataset_name="MoviesDataset", +) +``` + +### Run the pipeline: + +```py +info = pipeline.run( + lancedb_adapter( + movies, + embed="title", + ) +) +``` + +The data is now loaded into LanceDB. + +To use **vector search** after loading, you **must specify which fields LanceDB should generate embeddings for**. Do this by wrapping the data (or dlt resource) with the **`lancedb_adapter`** +function. + +## Using an Adapter to Specify Columns to Vectorise + +Out of the box, LanceDB will act as a normal database. To use LanceDB's embedding facilities, you'll need to specify which fields you'd like to embed in your dlt resource. + +The `lancedb_adapter` is a helper function that configures the resource for the LanceDB destination: + +```py +lancedb_adapter(data, embed) +``` + +It accepts the following arguments: + +- `data`: a dlt resource object, or a Python data structure (e.g. a list of dictionaries). +- `embed`: the name of a field, or a list of field names, to generate embeddings for. + +Returns: a [dlt resource](../../general-usage/resource.md) object that you can pass to `pipeline.run()`. + +Example: + +```py +lancedb_adapter( + resource, + embed=["title", "description"], +) +``` + +Bear in mind that you can't use an adapter on a [dlt source](../../general-usage/source.md), only a [dlt resource](../../general-usage/resource.md). + +## Write disposition + +All [write dispositions](../../general-usage/incremental-loading.md#choosing-a-write-disposition) are supported by the LanceDB destination. + +### Replace + +The [replace](../../general-usage/full-loading.md) disposition replaces the data in the destination with the data from the resource. + +```py +info = pipeline.run( + lancedb_adapter( + movies, + embed="title", + ), + write_disposition="replace", +) +``` + +### Merge + +The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data at the destination based on a unique identifier. + +```py +pipeline.run( + lancedb_adapter( + movies, + embed="title", + ), + write_disposition="merge", + primary_key="id", +) +``` + +### Append + +This is the default disposition. It will append the data to the existing data in the destination. + +## Additional Destination Options + +- `dataset_separator`: The character used to separate the dataset name from table names. Defaults to "___". +- `vector_field_name`: The name of the special field to store vector embeddings. Defaults to "vector__". +- `id_field_name`: The name of the special field used for deduplication and merging. Defaults to "id__". +- `max_retries`: The maximum number of retries for embedding operations. Set to 0 to disable retries. Defaults to 3. + + +## dbt support + +The LanceDB destination doesn't support dbt integration. + +## Syncing of `dlt` state + +The LanceDB destination supports syncing of the `dlt` state. 
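+ +## Querying the Loaded Data + +As a quick sanity check, you can open the generated tables directly with the LanceDB client. The sketch below assumes the default local `uri` and `dataset_separator` from the options above; the exact normalized table name depends on your dataset name and naming convention: + +```py +import lancedb + +db = lancedb.connect(".lancedb") +print(db.table_names())  # e.g. ["movies_dataset___movies", ...] + +# Text queries work for tables that were loaded with an embedded field, +# e.g. lancedb_adapter(movies, embed="title"). +tbl = db.open_table("movies_dataset___movies") +print(tbl.search("sci-fi classic").limit(2).to_list()) +``` 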
+ +## Current Limitations + +Adding new fields to an existing LanceDB table requires loading the entire table data into memory as a PyArrow table. +This is because PyArrow tables are immutable, so adding fields requires creating a new table with the updated schema. + +For huge tables, this may impact performance and memory usage since the full table must be loaded into memory to add the new fields. +Keep these considerations in mind when working with large datasets and monitor memory usage if adding fields to sizable existing tables. + + + diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 1ea92f2e91..4fa1c58eae 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -116,6 +116,7 @@ const sidebars = { 'dlt-ecosystem/destinations/snowflake', 'dlt-ecosystem/destinations/athena', 'dlt-ecosystem/destinations/weaviate', + 'dlt-ecosystem/destinations/lancedb', 'dlt-ecosystem/destinations/qdrant', 'dlt-ecosystem/destinations/dremio', 'dlt-ecosystem/destinations/destination', diff --git a/poetry.lock b/poetry.lock index 894f5868bc..1543d079c2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "about-time" @@ -3552,164 +3552,6 @@ files = [ {file = "google_re2-1.1-1-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c6c9f64b9724ec38da8e514f404ac64e9a6a5e8b1d7031c2dadd05c1f4c16fd"}, {file = "google_re2-1.1-1-cp39-cp39-win32.whl", hash = "sha256:d1b751b9ab9f8e2ab2a36d72b909281ce65f328c9115a1685acae1a2d1afd7a4"}, {file = "google_re2-1.1-1-cp39-cp39-win_amd64.whl", hash = "sha256:ac775c75cec7069351d201da4e0fb0cae4c1c5ebecd08fa34e1be89740c1d80b"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5eaefe4705b75ca5f78178a50104b689e9282f868e12f119b26b4cffc0c7ee6e"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:e35f2c8aabfaaa4ce6420b3cae86c0c29042b1b4f9937254347e9b985694a171"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:35fd189cbaaaa39c9a6a8a00164c8d9c709bacd0c231c694936879609beff516"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:60475d222cebd066c80414831c8a42aa2449aab252084102ee05440896586e6a"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:871cb85b9b0e1784c983b5c148156b3c5314cb29ca70432dff0d163c5c08d7e5"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:94f4e66e34bdb8de91ec6cdf20ba4fa9fea1dfdcfb77ff1f59700d01a0243664"}, - {file = "google_re2-1.1-2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1563577e2b720d267c4cffacc0f6a2b5c8480ea966ebdb1844fbea6602c7496f"}, - {file = "google_re2-1.1-2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:49b7964532a801b96062d78c0222d155873968f823a546a3dbe63d73f25bb56f"}, - {file = "google_re2-1.1-2-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2362fd70eb639a75fd0187d28b4ba7b20b3088833d8ad7ffd8693d0ba159e1c2"}, - {file = "google_re2-1.1-2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:86b80719636a4e21391e20a9adf18173ee6ae2ec956726fe2ff587417b5e8ba6"}, - {file = "google_re2-1.1-2-cp310-cp310-win32.whl", hash = "sha256:5456fba09df951fe8d1714474ed1ecda102a68ddffab0113e6c117d2e64e6f2b"}, - {file 
= "google_re2-1.1-2-cp310-cp310-win_amd64.whl", hash = "sha256:2ac6936a3a60d8d9de9563e90227b3aea27068f597274ca192c999a12d8baa8f"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d5a87b436028ec9b0f02fe19d4cbc19ef30441085cdfcdf1cce8fbe5c4bd5e9a"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:fc0d4163de9ed2155a77e7a2d59d94c348a6bbab3cff88922fab9e0d3d24faec"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:48b12d953bc796736e7831d67b36892fb6419a4cc44cb16521fe291e594bfe23"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:62c780c927cff98c1538439f0ff616f48a9b2e8837c676f53170d8ae5b9e83cb"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:04b2aefd768aa4edeef8b273327806c9cb0b82e90ff52eacf5d11003ac7a0db2"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:9c90175992346519ee7546d9af9a64541c05b6b70346b0ddc54a48aa0d3b6554"}, - {file = "google_re2-1.1-2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22ad9ad9d125249d6386a2e80efb9de7af8260b703b6be7fa0ab069c1cf56ced"}, - {file = "google_re2-1.1-2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f70971f6ffe5254e476e71d449089917f50ebf9cf60f9cec80975ab1693777e2"}, - {file = "google_re2-1.1-2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f267499529e64a4abed24c588f355ebe4700189d434d84a7367725f5a186e48d"}, - {file = "google_re2-1.1-2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b632eff5e4cd44545a9c0e52f2e1becd55831e25f4dd4e0d7ec8ee6ca50858c1"}, - {file = "google_re2-1.1-2-cp311-cp311-win32.whl", hash = "sha256:a42c733036e8f242ee4e5f0e27153ad4ca44ced9e4ce82f3972938ddee528db0"}, - {file = "google_re2-1.1-2-cp311-cp311-win_amd64.whl", hash = "sha256:64f8eed4ca96905d99b5286b3d14b5ca4f6a025ff3c1351626a7df2f93ad1ddd"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5541efcca5b5faf7e0d882334a04fa479bad4e7433f94870f46272eec0672c4a"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:92309af35b6eb2d3b3dc57045cdd83a76370958ab3e0edd2cc4638f6d23f5b32"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:197cd9bcaba96d18c5bf84d0c32fca7a26c234ea83b1d3083366f4392cb99f78"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:1b896f171d29b541256cf26e10dccc9103ac1894683914ed88828ca6facf8dca"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:e022d3239b945014e916ca7120fee659b246ec26c301f9e0542f1a19b38a8744"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:2c73f8a9440873b68bee1198094377501065e85aaf6fcc0d2512c7589ffa06ca"}, - {file = "google_re2-1.1-2-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:901d86555bd7725506d651afaba7d71cd4abd13260aed6cfd7c641a45f76d4f6"}, - {file = "google_re2-1.1-2-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ce4710ff636701cfb56eb91c19b775d53b03749a23b7d2a5071bbbf4342a9067"}, - {file = "google_re2-1.1-2-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76a20e5ebdf5bc5d430530197e42a2eeb562f729d3a3fb51f39168283d676e66"}, - {file = "google_re2-1.1-2-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:77c9f4d4bb1c8de9d2642d3c4b8b615858ba764df025b3b4f1310266f8def269"}, - {file = 
"google_re2-1.1-2-cp38-cp38-win32.whl", hash = "sha256:94bd60785bf37ef130a1613738e3c39465a67eae3f3be44bb918540d39b68da3"}, - {file = "google_re2-1.1-2-cp38-cp38-win_amd64.whl", hash = "sha256:59efeb77c0dcdbe37794c61f29c5b1f34bc06e8ec309a111ccdd29d380644d70"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:221e38c27e1dd9ccb8e911e9c7aed6439f68ce81e7bb74001076830b0d6e931d"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:d9145879e6c2e1b814445300b31f88a675e1f06c57564670d95a1442e8370c27"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:c8a12f0740e2a52826bdbf95569a4b0abdf413b4012fa71e94ad25dd4715c6e5"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:9c9998f71466f4db7bda752aa7c348b2881ff688e361108fe500caad1d8b9cb2"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:0c39f69b702005963a3d3bf78743e1733ad73efd7e6e8465d76e3009e4694ceb"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:6d0ce762dee8d6617d0b1788a9653e805e83a23046c441d0ea65f1e27bf84114"}, - {file = "google_re2-1.1-2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ecf3619d98c9b4a7844ab52552ad32597cdbc9a5bdbc7e3435391c653600d1e2"}, - {file = "google_re2-1.1-2-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9a1426a8cbd1fa004974574708d496005bd379310c4b1c7012be4bc75efde7a8"}, - {file = "google_re2-1.1-2-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1a30626ba48b4070f3eab272d860ef1952e710b088792c4d68dddb155be6bfc"}, - {file = "google_re2-1.1-2-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1b9c1ffcfbc3095b6ff601ec2d2bf662988f6ea6763bc1c9d52bec55881f8fde"}, - {file = "google_re2-1.1-2-cp39-cp39-win32.whl", hash = "sha256:32ecf995a252c0548404c1065ba4b36f1e524f1f4a86b6367a1a6c3da3801e30"}, - {file = "google_re2-1.1-2-cp39-cp39-win_amd64.whl", hash = "sha256:e7865410f3b112a3609739283ec3f4f6f25aae827ff59c6bfdf806fd394d753e"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3b21f83f0a201009c56f06fcc7294a33555ede97130e8a91b3f4cae01aed1d73"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b38194b91354a38db1f86f25d09cdc6ac85d63aee4c67b43da3048ce637adf45"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e7da3da8d6b5a18d6c3b61b11cc5b66b8564eaedce99d2312b15b6487730fc76"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:aeca656fb10d8638f245331aabab59c9e7e051ca974b366dd79e6a9efb12e401"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:2069d6dc94f5fa14a159bf99cad2f11e9c0f8ec3b7f44a4dde9e59afe5d1c786"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:2319a39305a4931cb5251451f2582713418a19bef2af7adf9e2a7a0edd939b99"}, - {file = "google_re2-1.1-3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb98fc131699756c6d86246f670a5e1c1cc1ba85413c425ad344cb30479b246c"}, - {file = "google_re2-1.1-3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6e038986d8ffe4e269f8532f03009f229d1f6018d4ac0dabc8aff876338f6e0"}, - {file = "google_re2-1.1-3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8618343ee658310e0f53bf586fab7409de43ce82bf8d9f7eb119536adc9783fd"}, - {file = 
"google_re2-1.1-3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d8140ca861cfe00602319cefe2c7b8737b379eb07fb328b51dc44584f47a2718"}, - {file = "google_re2-1.1-3-cp310-cp310-win32.whl", hash = "sha256:41f439c5c54e8a3a0a1fa2dbd1e809d3f643f862df7b16dd790f36a1238a272e"}, - {file = "google_re2-1.1-3-cp310-cp310-win_amd64.whl", hash = "sha256:fe20e97a33176d96d3e4b5b401de35182b9505823abea51425ec011f53ef5e56"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c39ff52b1765db039f690ee5b7b23919d8535aae94db7996079fbde0098c4d7"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:5420be674fd164041639ba4c825450f3d4bd635572acdde16b3dcd697f8aa3ef"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:ff53881cf1ce040f102a42d39db93c3f835f522337ae9c79839a842f26d97733"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:8d04600b0b53523118df2e413a71417c408f20dee640bf07dfab601c96a18a77"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:c4835d4849faa34a7fa1074098d81c420ed6c0707a3772482b02ce14f2a7c007"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:3309a9b81251d35fee15974d0ae0581a9a375266deeafdc3a3ac0d172a742357"}, - {file = "google_re2-1.1-3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e2b51cafee7e0bc72d0a4a454547bd8f257cde412ac9f1a2dc46a203b5e42cf4"}, - {file = "google_re2-1.1-3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:83f5f1cb52f832c2297d271ee8c56cf5e9053448162e5d2223d513f729bad908"}, - {file = "google_re2-1.1-3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55865a1ace92be3f7953b2e2b38b901d8074a367aa491daee43260a53a7fc6f0"}, - {file = "google_re2-1.1-3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cec2167dd142e583e98c783bd0d28b8cf5a9cdbe1f7407ba4163fe3ccb613cb9"}, - {file = "google_re2-1.1-3-cp311-cp311-win32.whl", hash = "sha256:a0bc1fe96849e4eb8b726d0bba493f5b989372243b32fe20729cace02e5a214d"}, - {file = "google_re2-1.1-3-cp311-cp311-win_amd64.whl", hash = "sha256:e6310a156db96fc5957cb007dd2feb18476898654530683897469447df73a7cd"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8e63cd10ea006088b320e8c5d308da1f6c87aa95138a71c60dd7ca1c8e91927e"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:12b566830a334178733a85e416b1e0507dbc0ceb322827616fe51ef56c5154f1"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:442e18c9d46b225c1496919c16eafe8f8d9bb4091b00b4d3440da03c55bbf4ed"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:c54c00263a9c39b2dacd93e9636319af51e3cf885c080b9680a9631708326460"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:15a3caeeb327bc22e0c9f95eb76890fec8874cacccd2b01ff5c080ab4819bbec"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:59ec0d2cced77f715d41f6eafd901f6b15c11e28ba25fe0effdc1de554d78e75"}, - {file = "google_re2-1.1-3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:185bf0e3441aed3840590f8e42f916e2920d235eb14df2cbc2049526803d3e71"}, - {file = "google_re2-1.1-3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:586d3f2014eea5be14d8de53374d9b79fa99689160e00efa64b5fe93af326087"}, - {file = 
"google_re2-1.1-3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cc2575082de4ffd234d9607f3ae67ca22b15a1a88793240e2045f3b3a36a5795"}, - {file = "google_re2-1.1-3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:59c5ad438eddb3630def394456091284d7bbc5b89351987f94f3792d296d1f96"}, - {file = "google_re2-1.1-3-cp312-cp312-win32.whl", hash = "sha256:5b9878c53f2bf16f75bf71d4ddd57f6611351408d5821040e91c53ebdf82c373"}, - {file = "google_re2-1.1-3-cp312-cp312-win_amd64.whl", hash = "sha256:4fdecfeb213110d0a85bad335a8e7cdb59fea7de81a4fe659233f487171980f9"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2dd87bacab32b709c28d0145fe75a956b6a39e28f0726d867375dba5721c76c1"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:55d24c61fe35dddc1bb484593a57c9f60f9e66d7f31f091ef9608ed0b6dde79f"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a0cf1180d908622df648c26b0cd09281f92129805ccc56a39227fdbfeab95cb4"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:09586f07f3f88d432265c75976da1c619ab7192cd7ebdf53f4ae0776c19e4b56"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:539f1b053402203576e919a06749198da4ae415931ee28948a1898131ae932ce"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:abf0bcb5365b0e27a5a23f3da403dffdbbac2c0e3a3f1535a8b10cc121b5d5fb"}, - {file = "google_re2-1.1-3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:19c83e5bbed7958213eeac3aa71c506525ce54faf03e07d0b96cd0a764890511"}, - {file = "google_re2-1.1-3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3348e77330ff672dc44ec01894fa5d93c409a532b6d688feac55e714e9059920"}, - {file = "google_re2-1.1-3-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:06b63edb57c5ce5a13eabfd71155e346b9477dc8906dec7c580d4f70c16a7e0d"}, - {file = "google_re2-1.1-3-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:12fe57ba2914092b83338d61d8def9ebd5a2bd0fd8679eceb5d4c2748105d5c0"}, - {file = "google_re2-1.1-3-cp38-cp38-win32.whl", hash = "sha256:80796e08d24e606e675019fe8de4eb5c94bb765be13c384f2695247d54a6df75"}, - {file = "google_re2-1.1-3-cp38-cp38-win_amd64.whl", hash = "sha256:3c2257dedfe7cc5deb6791e563af9e071a9d414dad89e37ac7ad22f91be171a9"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:43a0cd77c87c894f28969ac622f94b2e6d1571261dfdd785026848a25cfdc9b9"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:1038990b77fd66f279bd66a0832b67435ea925e15bb59eafc7b60fdec812b616"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fb5dda6875d18dd45f0f24ebced6d1f7388867c8fb04a235d1deab7ea479ce38"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:bb1d164965c6d57a351b421d2f77c051403766a8b75aaa602324ee2451fff77f"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a072ebfa495051d07ffecbf6ce21eb84793568d5c3c678c00ed8ff6b8066ab31"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:4eb66c8398c8a510adc97978d944b3b29c91181237218841ea1a91dc39ec0e54"}, - {file = "google_re2-1.1-3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f7c8b57b1f559553248d1757b7fa5b2e0cc845666738d155dff1987c2618264e"}, - {file = 
"google_re2-1.1-3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9162f6aa4f25453c682eb176f21b8e2f40205be9f667e98a54b3e1ff10d6ee75"}, - {file = "google_re2-1.1-3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a2d65ddf67fd7bf94705626871d463057d3d9a3538d41022f95b9d8f01df36e1"}, - {file = "google_re2-1.1-3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d140c7b9395b4d1e654127aa1c99bcc603ed01000b7bc7e28c52562f1894ec12"}, - {file = "google_re2-1.1-3-cp39-cp39-win32.whl", hash = "sha256:80c5fc200f64b2d903eeb07b8d6cefc620a872a0240c7caaa9aca05b20f5568f"}, - {file = "google_re2-1.1-3-cp39-cp39-win_amd64.whl", hash = "sha256:9eb6dbcee9b5dc4069bbc0634f2eb039ca524a14bed5868fdf6560aaafcbca06"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0db114d7e1aa96dbcea452a40136d7d747d60cbb61394965774688ef59cccd4e"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:82133958e003a1344e5b7a791b9a9dd7560b5c8f96936dbe16f294604524a633"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:9e74fd441d1f3d917d3303e319f61b82cdbd96b9a5ba919377a6eef1504a1e2b"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:734a2e7a4541c57253b5ebee24f3f3366ba3658bcad01da25fb623c78723471a"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:d88d5eecbc908abe16132456fae13690d0508f3ac5777f320ef95cb6cab9a961"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:b91db80b171ecec435a07977a227757dd487356701a32f556fa6fca5d0a40522"}, - {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b23129887a64bb9948af14c84705273ed1a40054e99433b4acccab4dcf6a226"}, - {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5dc1a0cc7cd19261dcaf76763e2499305dbb7e51dc69555167cdb8af98782698"}, - {file = "google_re2-1.1-4-cp310-cp310-win32.whl", hash = "sha256:3b2ab1e2420b5dd9743a2d6bc61b64e5f708563702a75b6db86637837eaeaf2f"}, - {file = "google_re2-1.1-4-cp310-cp310-win_amd64.whl", hash = "sha256:92efca1a7ef83b6df012d432a1cbc71d10ff42200640c0f9a5ff5b343a48e633"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:854818fd4ce79787aca5ba459d6e5abe4ca9be2c684a5b06a7f1757452ca3708"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:4ceef51174b6f653b6659a8fdaa9c38960c5228b44b25be2a3bcd8566827554f"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:ee49087c3db7e6f5238105ab5299c09e9b77516fe8cfb0a37e5f1e813d76ecb8"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:dc2312854bdc01410acc5d935f1906a49cb1f28980341c20a68797ad89d8e178"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:0dc0d2e42296fa84a3cb3e1bd667c6969389cd5cdf0786e6b1f911ae2d75375b"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6bf04ced98453b035f84320f348f67578024f44d2997498def149054eb860ae8"}, - {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d6b6ef11dc4ab322fa66c2f3561925f2b5372a879c3ed764d20e939e2fd3e5f"}, - {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0dcde6646fa9a97fd3692b3f6ae7daf7f3277d7500b6c253badeefa11db8956a"}, - {file = 
"google_re2-1.1-4-cp311-cp311-win32.whl", hash = "sha256:5f4f0229deb057348893574d5b0a96d055abebac6debf29d95b0c0e26524c9f6"}, - {file = "google_re2-1.1-4-cp311-cp311-win_amd64.whl", hash = "sha256:4713ddbe48a18875270b36a462b0eada5e84d6826f8df7edd328d8706b6f9d07"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:40a698300b8faddbb325662973f839489c89b960087060bd389c376828978a04"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:103d2d7ac92ba23911a151fd1fc7035cbf6dc92a7f6aea92270ebceb5cd5acd3"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:51fb7182bccab05e8258a2b6a63dda1a6b4a9e8dfb9b03ec50e50c49c2827dd4"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:65383022abd63d7b620221eba7935132b53244b8b463d8fdce498c93cf58b7b7"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:396281fc68a9337157b3ffcd9392c6b7fcb8aab43e5bdab496262a81d56a4ecc"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8198adcfcff1c680e052044124621730fc48d08005f90a75487f5651f1ebfce2"}, - {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:81f7bff07c448aec4db9ca453d2126ece8710dbd9278b8bb09642045d3402a96"}, - {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7dacf730fd7d6ec71b11d6404b0b26e230814bfc8e9bb0d3f13bec9b5531f8d"}, - {file = "google_re2-1.1-4-cp312-cp312-win32.whl", hash = "sha256:8c764f62f4b1d89d1ef264853b6dd9fee14a89e9b86a81bc2157fe3531425eb4"}, - {file = "google_re2-1.1-4-cp312-cp312-win_amd64.whl", hash = "sha256:0be2666df4bc5381a5d693585f9bbfefb0bfd3c07530d7e403f181f5de47254a"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:5cb1b63a0bfd8dd65d39d2f3b2e5ae0a06ce4b2ce5818a1d1fc78a786a252673"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:e41751ce6b67a95230edd0772226dc94c2952a2909674cd69df9804ed0125307"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:b998cfa2d50bf4c063e777c999a7e8645ec7e5d7baf43ad71b1e2e10bb0300c3"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:226ca3b0c2e970f3fc82001ac89e845ecc7a4bb7c68583e7a76cda70b61251a7"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:9adec1f734ebad7c72e56c85f205a281d8fe9bf6583bc21020157d3f2812ce89"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:9c34f3c64ba566af967d29e11299560e6fdfacd8ca695120a7062b6ed993b179"}, - {file = "google_re2-1.1-4-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e1b85385fe293838e0d0b6e19e6c48ba8c6f739ea92ce2e23b718afe7b343363"}, - {file = "google_re2-1.1-4-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4694daa8a8987cfb568847aa872f9990e930c91a68c892ead876411d4b9012c3"}, - {file = "google_re2-1.1-4-cp38-cp38-win32.whl", hash = "sha256:5e671e9be1668187e2995aac378de574fa40df70bb6f04657af4d30a79274ce0"}, - {file = "google_re2-1.1-4-cp38-cp38-win_amd64.whl", hash = "sha256:f66c164d6049a8299f6dfcfa52d1580576b4b9724d6fcdad2f36f8f5da9304b6"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:25cb17ae0993a48c70596f3a3ef5d659638106401cc8193f51c0d7961b3b3eb7"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:5f101f86d14ca94ca4dcf63cceaa73d351f2be2481fcaa29d9e68eeab0dc2a88"}, - 
{file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:4e82591e85bf262a6d74cff152867e05fc97867c68ba81d6836ff8b0e7e62365"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:1f61c09b93ffd34b1e2557e5a9565039f935407a5786dbad46f64f1a484166e6"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:12b390ad8c7e74bab068732f774e75e0680dade6469b249a721f3432f90edfc3"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:1284343eb31c2e82ed2d8159f33ba6842238a56782c881b07845a6d85613b055"}, - {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c7b38e0daf2c06e4d3163f4c732ab3ad2521aecfed6605b69e4482c612da303"}, - {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, - {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, - {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, ] [[package]] @@ -4486,6 +4328,42 @@ docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"] embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "instructorembedding", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] +[[package]] +name = "lancedb" +version = "0.9.0" +description = "lancedb" +optional = false +python-versions = ">=3.8" +files = [ + {file = "lancedb-0.9.0-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:b1ca08797c72c93ae512aa1078f1891756da157d910fbae8e194fac3528fc1ac"}, + {file = "lancedb-0.9.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:15129791f03c2c04b95f914ced2c1556b43d73a24710207b9af77b6e4008bdeb"}, + {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f093d89447a2039b820d2540a0b64df3024e4549b6808ebd26b44fbe0345cc6"}, + {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:a8c1f6777e217d2277451038866d280fa5fb38bd161795e51703b043c26dd345"}, + {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:78dd5800a1148f89d33b7e98d1c8b1c42dee146f03580abc1ca83cb05273ff7f"}, + {file = "lancedb-0.9.0-cp38-abi3-win_amd64.whl", hash = "sha256:ba5bdc727d3bc131f17414f42372acde5817073feeb553793a3d20003caa1658"}, +] + +[package.dependencies] +attrs = ">=21.3.0" +cachetools = "*" +deprecation = "*" +overrides = ">=0.7" +packaging = "*" +pydantic = ">=1.10" +pylance = "0.13.0" +ratelimiter = ">=1.0,<2.0" +requests = ">=2.31.0" +retry = ">=0.9.2" +tqdm = ">=4.27.0" + +[package.extras] +azure = ["adlfs (>=2024.2.0)"] +clip = ["open-clip", "pillow", "torch"] +dev = ["pre-commit", "ruff"] +docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"] +embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "instructorembedding", "ollama", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] +tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] + 
[[package]] name = "lazy-object-proxy" version = "1.9.0" @@ -4662,13 +4540,10 @@ files = [ {file = "lxml-4.9.3-cp27-cp27m-macosx_11_0_x86_64.whl", hash = "sha256:b0a545b46b526d418eb91754565ba5b63b1c0b12f9bd2f808c852d9b4b2f9b5c"}, {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:075b731ddd9e7f68ad24c635374211376aa05a281673ede86cbe1d1b3455279d"}, {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1e224d5755dba2f4a9498e150c43792392ac9b5380aa1b845f98a1618c94eeef"}, - {file = "lxml-4.9.3-cp27-cp27m-win32.whl", hash = "sha256:2c74524e179f2ad6d2a4f7caf70e2d96639c0954c943ad601a9e146c76408ed7"}, - {file = "lxml-4.9.3-cp27-cp27m-win_amd64.whl", hash = "sha256:4f1026bc732b6a7f96369f7bfe1a4f2290fb34dce00d8644bc3036fb351a4ca1"}, {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c0781a98ff5e6586926293e59480b64ddd46282953203c76ae15dbbbf302e8bb"}, {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cef2502e7e8a96fe5ad686d60b49e1ab03e438bd9123987994528febd569868e"}, {file = "lxml-4.9.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b86164d2cff4d3aaa1f04a14685cbc072efd0b4f99ca5708b2ad1b9b5988a991"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:42871176e7896d5d45138f6d28751053c711ed4d48d8e30b498da155af39aebd"}, - {file = "lxml-4.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae8b9c6deb1e634ba4f1930eb67ef6e6bf6a44b6eb5ad605642b2d6d5ed9ce3c"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:411007c0d88188d9f621b11d252cce90c4a2d1a49db6c068e3c16422f306eab8"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:cd47b4a0d41d2afa3e58e5bf1f62069255aa2fd6ff5ee41604418ca925911d76"}, {file = "lxml-4.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e2cb47860da1f7e9a5256254b74ae331687b9672dfa780eed355c4c9c3dbd23"}, @@ -4677,7 +4552,6 @@ files = [ {file = "lxml-4.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:97047f0d25cd4bcae81f9ec9dc290ca3e15927c192df17331b53bebe0e3ff96d"}, {file = "lxml-4.9.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:1f447ea5429b54f9582d4b955f5f1985f278ce5cf169f72eea8afd9502973dd5"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:57d6ba0ca2b0c462f339640d22882acc711de224d769edf29962b09f77129cbf"}, - {file = "lxml-4.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:9767e79108424fb6c3edf8f81e6730666a50feb01a328f4a016464a5893f835a"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:71c52db65e4b56b8ddc5bb89fb2e66c558ed9d1a74a45ceb7dcb20c191c3df2f"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d73d8ecf8ecf10a3bd007f2192725a34bd62898e8da27eb9d32a58084f93962b"}, {file = "lxml-4.9.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0a3d3487f07c1d7f150894c238299934a2a074ef590b583103a45002035be120"}, @@ -4697,7 +4571,6 @@ files = [ {file = "lxml-4.9.3-cp36-cp36m-macosx_11_0_x86_64.whl", hash = "sha256:64f479d719dc9f4c813ad9bb6b28f8390360660b73b2e4beb4cb0ae7104f1c12"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:dd708cf4ee4408cf46a48b108fb9427bfa00b9b85812a9262b5c668af2533ea5"}, {file = 
"lxml-4.9.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c31c7462abdf8f2ac0577d9f05279727e698f97ecbb02f17939ea99ae8daa98"}, - {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e3cd95e10c2610c360154afdc2f1480aea394f4a4f1ea0a5eacce49640c9b190"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:4930be26af26ac545c3dffb662521d4e6268352866956672231887d18f0eaab2"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4aec80cde9197340bc353d2768e2a75f5f60bacda2bab72ab1dc499589b3878c"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:14e019fd83b831b2e61baed40cab76222139926b1fb5ed0e79225bc0cae14584"}, @@ -4707,7 +4580,6 @@ files = [ {file = "lxml-4.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bef4e656f7d98aaa3486d2627e7d2df1157d7e88e7efd43a65aa5dd4714916cf"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:46f409a2d60f634fe550f7133ed30ad5321ae2e6630f13657fb9479506b00601"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:4c28a9144688aef80d6ea666c809b4b0e50010a2aca784c97f5e6bf143d9f129"}, - {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:141f1d1a9b663c679dc524af3ea1773e618907e96075262726c7612c02b149a4"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:53ace1c1fd5a74ef662f844a0413446c0629d151055340e9893da958a374f70d"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17a753023436a18e27dd7769e798ce302963c236bc4114ceee5b25c18c52c693"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7d298a1bd60c067ea75d9f684f5f3992c9d6766fadbc0bcedd39750bf344c2f4"}, @@ -4717,7 +4589,6 @@ files = [ {file = "lxml-4.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:120fa9349a24c7043854c53cae8cec227e1f79195a7493e09e0c12e29f918e52"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4d2d1edbca80b510443f51afd8496be95529db04a509bc8faee49c7b0fb6d2cc"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d7e43bd40f65f7d97ad8ef5c9b1778943d02f04febef12def25f7583d19baac"}, - {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:71d66ee82e7417828af6ecd7db817913cb0cf9d4e61aa0ac1fde0583d84358db"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:6fc3c450eaa0b56f815c7b62f2b7fba7266c4779adcf1cece9e6deb1de7305ce"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65299ea57d82fb91c7f019300d24050c4ddeb7c5a190e076b5f48a2b43d19c42"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:eadfbbbfb41b44034a4c757fd5d70baccd43296fb894dba0295606a7cf3124aa"}, @@ -4727,7 +4598,6 @@ files = [ {file = "lxml-4.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:92af161ecbdb2883c4593d5ed4815ea71b31fafd7fd05789b23100d081ecac96"}, {file = "lxml-4.9.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:9bb6ad405121241e99a86efff22d3ef469024ce22875a7ae045896ad23ba2340"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = 
"sha256:8ed74706b26ad100433da4b9d807eae371efaa266ffc3e9191ea436087a9d6a7"}, - {file = "lxml-4.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fbf521479bcac1e25a663df882c46a641a9bff6b56dc8b0fafaebd2f66fb231b"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:303bf1edce6ced16bf67a18a1cf8339d0db79577eec5d9a6d4a80f0fb10aa2da"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:5515edd2a6d1a5a70bfcdee23b42ec33425e405c5b351478ab7dc9347228f96e"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:690dafd0b187ed38583a648076865d8c229661ed20e48f2335d68e2cf7dc829d"}, @@ -4738,16 +4608,13 @@ files = [ {file = "lxml-4.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:4dd9a263e845a72eacb60d12401e37c616438ea2e5442885f65082c276dfb2b2"}, {file = "lxml-4.9.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6689a3d7fd13dc687e9102a27e98ef33730ac4fe37795d5036d18b4d527abd35"}, {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f6bdac493b949141b733c5345b6ba8f87a226029cbabc7e9e121a413e49441e0"}, - {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:05186a0f1346ae12553d66df1cfce6f251589fea3ad3da4f3ef4e34b2d58c6a3"}, {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c2006f5c8d28dee289f7020f721354362fa304acbaaf9745751ac4006650254b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-macosx_11_0_x86_64.whl", hash = "sha256:5c245b783db29c4e4fbbbfc9c5a78be496c9fea25517f90606aa1f6b2b3d5f7b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4fb960a632a49f2f089d522f70496640fdf1218f1243889da3822e0a9f5f3ba7"}, - {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:50670615eaf97227d5dc60de2dc99fb134a7130d310d783314e7724bf163f75d"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9719fe17307a9e814580af1f5c6e05ca593b12fb7e44fe62450a5384dbf61b4b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3331bece23c9ee066e0fb3f96c61322b9e0f54d775fccefff4c38ca488de283a"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-macosx_11_0_x86_64.whl", hash = "sha256:ed667f49b11360951e201453fc3967344d0d0263aa415e1619e85ae7fd17b4e0"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8b77946fd508cbf0fccd8e400a7f71d4ac0e1595812e66025bac475a8e811694"}, - {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e4da8ca0c0c0aea88fd46be8e44bd49716772358d648cce45fe387f7b92374a7"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fe4bda6bd4340caa6e5cf95e73f8fea5c4bfc55763dd42f1b50a94c1b4a2fbd4"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f3df3db1d336b9356dd3112eae5f5c2b8b377f3bc826848567f10bfddfee77e9"}, {file = "lxml-4.9.3.tar.gz", hash = "sha256:48628bd53a426c9eb9bc066a923acaa0878d1e86129fd5359aee99285f4eed9c"}, @@ -4908,16 +4775,6 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = 
"sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -6916,6 +6773,32 @@ ray = ["ray[data]"] tests = ["boto3", "datasets", "duckdb", "h5py (<3.11)", "ml-dtypes", "pandas", "pillow", "polars[pandas,pyarrow]", "pytest", "tensorflow", "tqdm"] torch = ["torch"] +[[package]] +name = "pylance" +version = "0.13.0" +description = "python wrapper for Lance columnar format" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pylance-0.13.0-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2f3d6f9eec1f59f45dccb01075ba79868b8d37c8371d6210bcf6418217a0dd8b"}, + {file = "pylance-0.13.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:f4861ab466c94b0f9a4b4e6de6e1dfa02f40e7242d8db87447bc7bb7d89606ac"}, + {file = "pylance-0.13.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3cb92547e145f5bfb0ea7d6f483953913b9bdd44c45bea84fc95a18da9f5853"}, + {file = "pylance-0.13.0-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:d1ddd7700924bc6b6b0774ea63d2aa23f9210a86cd6d6af0cdfa987df776d50d"}, + {file = "pylance-0.13.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:c51d4b6e59cf4dc97c11a35b299f11e80dbdf392e2d8dc498573c26474a3c19e"}, + {file = "pylance-0.13.0-cp39-abi3-win_amd64.whl", hash = "sha256:4018ba016f1445874960a4ba2ad5c80cb380f3116683282ee8beabd38fa8989d"}, +] + 
+[package.dependencies] +numpy = ">=1.22" +pyarrow = ">=12,<15.0.1" + +[package.extras] +benchmarks = ["pytest-benchmark"] +dev = ["ruff (==0.4.1)"] +ray = ["ray[data]"] +tests = ["boto3", "datasets", "duckdb", "h5py (<3.11)", "ml-dtypes", "pandas", "pillow", "polars[pandas,pyarrow]", "pytest", "tensorflow", "tqdm"] +torch = ["torch"] + [[package]] name = "pymongo" version = "4.6.0" @@ -6953,7 +6836,6 @@ files = [ {file = "pymongo-4.6.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ab6bcc8e424e07c1d4ba6df96f7fb963bcb48f590b9456de9ebd03b88084fe8"}, {file = "pymongo-4.6.0-cp312-cp312-win32.whl", hash = "sha256:47aa128be2e66abd9d1a9b0437c62499d812d291f17b55185cb4aa33a5f710a4"}, {file = "pymongo-4.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:014e7049dd019a6663747ca7dae328943e14f7261f7c1381045dfc26a04fa330"}, - {file = "pymongo-4.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e24025625bad66895b1bc3ae1647f48f0a92dd014108fb1be404c77f0b69ca67"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:288c21ab9531b037f7efa4e467b33176bc73a0c27223c141b822ab4a0e66ff2a"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:747c84f4e690fbe6999c90ac97246c95d31460d890510e4a3fa61b7d2b87aa34"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:055f5c266e2767a88bb585d01137d9c7f778b0195d3dbf4a487ef0638be9b651"}, @@ -7394,7 +7276,6 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -7402,16 +7283,8 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -7428,7 +7301,6 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -7436,7 +7308,6 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ 
-8360,7 +8231,6 @@ files = [ {file = "SQLAlchemy-1.4.49-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:03db81b89fe7ef3857b4a00b63dedd632d6183d4ea5a31c5d8a92e000a41fc71"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:95b9df9afd680b7a3b13b38adf6e3a38995da5e162cc7524ef08e3be4e5ed3e1"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a63e43bf3f668c11bb0444ce6e809c1227b8f067ca1068898f3008a273f52b09"}, - {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca46de16650d143a928d10842939dab208e8d8c3a9a8757600cae9b7c579c5cd"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f835c050ebaa4e48b18403bed2c0fda986525896efd76c245bdd4db995e51a4c"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c21b172dfb22e0db303ff6419451f0cac891d2e911bb9fbf8003d717f1bcf91"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-win32.whl", hash = "sha256:5fb1ebdfc8373b5a291485757bd6431de8d7ed42c27439f543c81f6c8febd729"}, @@ -8370,35 +8240,26 @@ files = [ {file = "SQLAlchemy-1.4.49-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5debe7d49b8acf1f3035317e63d9ec8d5e4d904c6e75a2a9246a119f5f2fdf3d"}, {file = "SQLAlchemy-1.4.49-cp311-cp311-win32.whl", hash = "sha256:82b08e82da3756765c2e75f327b9bf6b0f043c9c3925fb95fb51e1567fa4ee87"}, {file = "SQLAlchemy-1.4.49-cp311-cp311-win_amd64.whl", hash = "sha256:171e04eeb5d1c0d96a544caf982621a1711d078dbc5c96f11d6469169bd003f1"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f23755c384c2969ca2f7667a83f7c5648fcf8b62a3f2bbd883d805454964a800"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8396e896e08e37032e87e7fbf4a15f431aa878c286dc7f79e616c2feacdb366c"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66da9627cfcc43bbdebd47bfe0145bb662041472393c03b7802253993b6b7c90"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-win32.whl", hash = "sha256:9a06e046ffeb8a484279e54bda0a5abfd9675f594a2e38ef3133d7e4d75b6214"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-win_amd64.whl", hash = "sha256:7cf8b90ad84ad3a45098b1c9f56f2b161601e4670827d6b892ea0e884569bd1d"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:36e58f8c4fe43984384e3fbe6341ac99b6b4e083de2fe838f0fdb91cebe9e9cb"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b31e67ff419013f99ad6f8fc73ee19ea31585e1e9fe773744c0f3ce58c039c30"}, - {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc22807a7e161c0d8f3da34018ab7c97ef6223578fcdd99b1d3e7ed1100a5db"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c14b29d9e1529f99efd550cd04dbb6db6ba5d690abb96d52de2bff4ed518bc95"}, {file = 
"SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c40f3470e084d31247aea228aa1c39bbc0904c2b9ccbf5d3cfa2ea2dac06f26d"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-win32.whl", hash = "sha256:706bfa02157b97c136547c406f263e4c6274a7b061b3eb9742915dd774bbc264"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-win_amd64.whl", hash = "sha256:a7f7b5c07ae5c0cfd24c2db86071fb2a3d947da7bd487e359cc91e67ac1c6d2e"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:4afbbf5ef41ac18e02c8dc1f86c04b22b7a2125f2a030e25bbb4aff31abb224b"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:24e300c0c2147484a002b175f4e1361f102e82c345bf263242f0449672a4bccf"}, - {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:393cd06c3b00b57f5421e2133e088df9cabcececcea180327e43b937b5a7caa5"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:201de072b818f8ad55c80d18d1a788729cccf9be6d9dc3b9d8613b053cd4836d"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7653ed6817c710d0c95558232aba799307d14ae084cc9b1f4c389157ec50df5c"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-win32.whl", hash = "sha256:647e0b309cb4512b1f1b78471fdaf72921b6fa6e750b9f891e09c6e2f0e5326f"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-win_amd64.whl", hash = "sha256:ab73ed1a05ff539afc4a7f8cf371764cdf79768ecb7d2ec691e3ff89abbc541e"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:37ce517c011560d68f1ffb28af65d7e06f873f191eb3a73af5671e9c3fada08a"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1878ce508edea4a879015ab5215546c444233881301e97ca16fe251e89f1c55"}, - {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95ab792ca493891d7a45a077e35b418f68435efb3e1706cb8155e20e86a9013c"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0e8e608983e6f85d0852ca61f97e521b62e67969e6e640fe6c6b575d4db68557"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccf956da45290df6e809ea12c54c02ace7f8ff4d765d6d3dfb3655ee876ce58d"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-win32.whl", hash = "sha256:f167c8175ab908ce48bd6550679cc6ea20ae169379e73c7720a28f89e53aa532"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-win_amd64.whl", hash = "sha256:45806315aae81a0c202752558f0df52b42d11dd7ba0097bf71e253b4215f34f4"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:b6d0c4b15d65087738a6e22e0ff461b407533ff65a73b818089efc8eb2b3e1de"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a843e34abfd4c797018fd8d00ffffa99fd5184c421f190b6ca99def4087689bd"}, - {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:738d7321212941ab19ba2acf02a68b8ee64987b248ffa2101630e8fccb549e0d"}, 
{file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1c890421651b45a681181301b3497e4d57c0d01dc001e10438a40e9a9c25ee77"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d26f280b8f0a8f497bc10573849ad6dc62e671d2468826e5c748d04ed9e670d5"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-win32.whl", hash = "sha256:ec2268de67f73b43320383947e74700e95c6770d0c68c4e615e9897e46296294"}, @@ -9569,6 +9430,7 @@ duckdb = ["duckdb"] filesystem = ["botocore", "s3fs"] gcp = ["gcsfs", "google-cloud-bigquery", "grpcio"] gs = ["gcsfs"] +lancedb = ["lancedb", "pyarrow"] motherduck = ["duckdb", "pyarrow"] mssql = ["pyodbc"] parquet = ["pyarrow"] @@ -9583,4 +9445,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "920869be38d3b82c2c62e77d814c43be62b71ed73d68d1c570ac70886d439b91" +content-hash = "9979b133732a91a49fe3014afde0f5e3455cbc26a129aecad6171672c3f0f4a9" diff --git a/pyproject.toml b/pyproject.toml index c7cda5a994..cd4d6a78da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,7 @@ qdrant-client = {version = ">=1.8", optional = true, extras = ["fastembed"]} databricks-sql-connector = {version = ">=2.9.3", optional = true} clickhouse-driver = { version = ">=0.2.7", optional = true } clickhouse-connect = { version = ">=0.7.7", optional = true } +lancedb = { version = ">=0.8.2", optional = true, markers = "python_version >= '3.9'" } deltalake = { version = ">=0.17.4", optional = true } [tool.poetry.extras] @@ -103,8 +104,10 @@ qdrant = ["qdrant-client"] databricks = ["databricks-sql-connector"] clickhouse = ["clickhouse-driver", "clickhouse-connect", "s3fs", "gcsfs", "adlfs", "pyarrow"] dremio = ["pyarrow"] +lancedb = ["lancedb", "pyarrow"] deltalake = ["deltalake", "pyarrow"] + [tool.poetry.scripts] dlt = "dlt.cli._dlt:_main" diff --git a/tests/load/lancedb/__init__.py b/tests/load/lancedb/__init__.py new file mode 100644 index 0000000000..fb4bf0b35d --- /dev/null +++ b/tests/load/lancedb/__init__.py @@ -0,0 +1,3 @@ +from tests.utils import skip_if_not_active + +skip_if_not_active("lancedb") diff --git a/tests/load/lancedb/test_config.py b/tests/load/lancedb/test_config.py new file mode 100644 index 0000000000..c1d658d4fe --- /dev/null +++ b/tests/load/lancedb/test_config.py @@ -0,0 +1,35 @@ +import os +from typing import Iterator + +import pytest + +from dlt.common.configuration import resolve_configuration +from dlt.common.utils import digest128 +from dlt.destinations.impl.lancedb.configuration import ( + LanceDBClientConfiguration, +) +from tests.load.utils import ( + drop_active_pipeline_data, +) + + +# Mark all tests as essential, do not remove. 
+pytestmark = pytest.mark.essential + + +@pytest.fixture(autouse=True) +def drop_lancedb_data() -> Iterator[None]: + yield + drop_active_pipeline_data() + + +def test_lancedb_configuration() -> None: + os.environ["DESTINATION__LANCEDB__EMBEDDING_MODEL_PROVIDER"] = "colbert" + os.environ["DESTINATION__LANCEDB__EMBEDDING_MODEL"] = "text-embedding-3-small" + + config = resolve_configuration( + LanceDBClientConfiguration()._bind_dataset_name(dataset_name="dataset"), + sections=("destination", "lancedb"), + ) + assert config.embedding_model_provider == "colbert" + assert config.embedding_model == "text-embedding-3-small" diff --git a/tests/load/lancedb/test_pipeline.py b/tests/load/lancedb/test_pipeline.py new file mode 100644 index 0000000000..a89153f629 --- /dev/null +++ b/tests/load/lancedb/test_pipeline.py @@ -0,0 +1,435 @@ +from typing import Iterator, Generator, Any, List + +import pytest + +import dlt +from dlt.common import json +from dlt.common.typing import DictStrStr, DictStrAny +from dlt.common.utils import uniq_id +from dlt.destinations.impl.lancedb.lancedb_adapter import ( + lancedb_adapter, + VECTORIZE_HINT, +) +from dlt.destinations.impl.lancedb.lancedb_client import LanceDBClient +from tests.load.lancedb.utils import assert_table +from tests.load.utils import sequence_generator, drop_active_pipeline_data +from tests.pipeline.utils import assert_load_info + + +# Mark all tests as essential, do not remove. +pytestmark = pytest.mark.essential + + +@pytest.fixture(autouse=True) +def drop_lancedb_data() -> Iterator[None]: + yield + drop_active_pipeline_data() + + +def test_adapter_and_hints() -> None: + generator_instance1 = sequence_generator() + + @dlt.resource(columns=[{"name": "content", "data_type": "text"}]) + def some_data() -> Generator[DictStrStr, Any, None]: + yield from next(generator_instance1) + + assert some_data.columns["content"] == {"name": "content", "data_type": "text"} # type: ignore[index] + + lancedb_adapter( + some_data, + embed=["content"], + ) + + assert some_data.columns["content"] == { # type: ignore + "name": "content", + "data_type": "text", + "x-lancedb-embed": True, + } + + +def test_basic_state_and_schema() -> None: + generator_instance1 = sequence_generator() + + @dlt.resource + def some_data() -> Generator[DictStrStr, Any, None]: + yield from next(generator_instance1) + + lancedb_adapter( + some_data, + embed=["content"], + ) + + pipeline = dlt.pipeline( + pipeline_name="test_pipeline_append", + destination="lancedb", + dataset_name=f"test_pipeline_append_dataset{uniq_id()}", + ) + info = pipeline.run( + some_data(), + ) + assert_load_info(info) + + client: LanceDBClient + with pipeline.destination_client() as client: # type: ignore + # Check if we can get a stored schema and state. 
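+        # dlt persists its system tables (_dlt_version, _dlt_loads, _dlt_pipeline_state)
+        # in the dataset, which is what makes the schema and state retrievable here.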
+ schema = client.get_stored_schema() + print("Print dataset name", client.dataset_name) + assert schema + state = client.get_stored_state("test_pipeline_append") + assert state + + +def test_pipeline_append() -> None: + generator_instance1 = sequence_generator() + generator_instance2 = sequence_generator() + + @dlt.resource + def some_data() -> Generator[DictStrStr, Any, None]: + yield from next(generator_instance1) + + lancedb_adapter( + some_data, + embed=["content"], + ) + + pipeline = dlt.pipeline( + pipeline_name="test_pipeline_append", + destination="lancedb", + dataset_name=f"TestPipelineAppendDataset{uniq_id()}", + ) + info = pipeline.run( + some_data(), + ) + assert_load_info(info) + + data = next(generator_instance2) + assert_table(pipeline, "some_data", items=data) + + info = pipeline.run( + some_data(), + ) + assert_load_info(info) + + data.extend(next(generator_instance2)) + assert_table(pipeline, "some_data", items=data) + + +def test_explicit_append() -> None: + """Append should work even when the primary key is specified.""" + data = [ + {"doc_id": 1, "content": "1"}, + {"doc_id": 2, "content": "2"}, + {"doc_id": 3, "content": "3"}, + ] + + @dlt.resource(primary_key="doc_id") + def some_data() -> Generator[List[DictStrAny], Any, None]: + yield data + + lancedb_adapter( + some_data, + embed=["content"], + ) + + pipeline = dlt.pipeline( + pipeline_name="test_pipeline_append", + destination="lancedb", + dataset_name=f"TestPipelineAppendDataset{uniq_id()}", + ) + info = pipeline.run( + some_data(), + ) + + assert_table(pipeline, "some_data", items=data) + + info = pipeline.run( + some_data(), + write_disposition="append", + ) + assert_load_info(info) + + data.extend(data) + assert_table(pipeline, "some_data", items=data) + + +def test_pipeline_replace() -> None: + generator_instance1 = sequence_generator() + generator_instance2 = sequence_generator() + + @dlt.resource + def some_data() -> Generator[DictStrStr, Any, None]: + yield from next(generator_instance1) + + lancedb_adapter( + some_data, + embed=["content"], + ) + + uid = uniq_id() + + pipeline = dlt.pipeline( + pipeline_name="test_pipeline_replace", + destination="lancedb", + dataset_name="test_pipeline_replace_dataset" + + uid, # lancedb doesn't mandate any name normalization + ) + + info = pipeline.run( + some_data(), + write_disposition="replace", + ) + assert_load_info(info) + assert info.dataset_name == f"test_pipeline_replace_dataset{uid}" + + data = next(generator_instance2) + assert_table(pipeline, "some_data", items=data) + + info = pipeline.run( + some_data(), + write_disposition="replace", + ) + assert_load_info(info) + + data = next(generator_instance2) + assert_table(pipeline, "some_data", items=data) + + +def test_pipeline_merge() -> None: + data = [ + { + "doc_id": 1, + "merge_id": "shawshank-redemption-1994", + "title": "The Shawshank Redemption", + "description": ( + "Two imprisoned men find redemption through acts of decency over the years." + ), + }, + { + "doc_id": 2, + "merge_id": "the-godfather-1972", + "title": "The Godfather", + "description": ( + "A crime dynasty's aging patriarch transfers control to his reluctant son." + ), + }, + { + "doc_id": 3, + "merge_id": "the-dark-knight-2008", + "title": "The Dark Knight", + "description": ( + "The Joker wreaks havoc on Gotham, challenging The Dark Knight's ability to fight" + " injustice." 
+ ), + }, + { + "doc_id": 4, + "merge_id": "pulp-fiction-1994", + "title": "Pulp Fiction", + "description": ( + "The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner" + " bandits intertwine in four tales of violence and redemption." + ), + }, + { + "doc_id": 5, + "merge_id": "schindlers-list-1993", + "title": "Schindler's List", + "description": ( + "In German-occupied Poland during World War II, industrialist Oskar Schindler" + " gradually becomes concerned for his Jewish workforce after witnessing their" + " persecution by the Nazis." + ), + }, + { + "doc_id": 6, + "merge_id": "the-lord-of-the-rings-the-return-of-the-king-2003", + "title": "The Lord of the Rings: The Return of the King", + "description": ( + "Gandalf and Aragorn lead the World of Men against Sauron's army to draw his gaze" + " from Frodo and Sam as they approach Mount Doom with the One Ring." + ), + }, + { + "doc_id": 7, + "merge_id": "the-matrix-1999", + "title": "The Matrix", + "description": ( + "A computer hacker learns from mysterious rebels about the true nature of his" + " reality and his role in the war against its controllers." + ), + }, + ] + + @dlt.resource(primary_key="doc_id") + def movies_data() -> Any: + yield data + + @dlt.resource(primary_key="doc_id", merge_key=["merge_id", "title"]) + def movies_data_explicit_merge_keys() -> Any: + yield data + + lancedb_adapter( + movies_data, + embed=["description"], + ) + + lancedb_adapter( + movies_data_explicit_merge_keys, + embed=["description"], + ) + + pipeline = dlt.pipeline( + pipeline_name="movies", + destination="lancedb", + dataset_name=f"TestPipelineAppendDataset{uniq_id()}", + ) + info = pipeline.run( + movies_data(), + write_disposition="merge", + dataset_name=f"MoviesDataset{uniq_id()}", + ) + assert_load_info(info) + assert_table(pipeline, "movies_data", items=data) + + # Change some data. + data[0]["title"] = "The Shawshank Redemption 2" + + info = pipeline.run( + movies_data(), + write_disposition="merge", + ) + assert_load_info(info) + assert_table(pipeline, "movies_data", items=data) + + info = pipeline.run( + movies_data(), + write_disposition="merge", + ) + assert_load_info(info) + assert_table(pipeline, "movies_data", items=data) + + # Test with explicit merge keys. 
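+    # With merge keys declared on the resource, rows should be matched on
+    # ("merge_id", "title") rather than on the primary key alone, so
+    # reloading the same data replaces rows instead of appending duplicates.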
+    info = pipeline.run(
+        movies_data_explicit_merge_keys(),
+        write_disposition="merge",
+    )
+    assert_load_info(info)
+    assert_table(pipeline, "movies_data_explicit_merge_keys", items=data)
+
+
+def test_pipeline_with_schema_evolution() -> None:
+    data = [
+        {
+            "doc_id": 1,
+            "content": "1",
+        },
+        {
+            "doc_id": 2,
+            "content": "2",
+        },
+    ]
+
+    @dlt.resource()
+    def some_data() -> Generator[List[DictStrAny], Any, None]:
+        yield data
+
+    lancedb_adapter(some_data, embed=["content"])
+
+    pipeline = dlt.pipeline(
+        pipeline_name="test_pipeline_append",
+        destination="lancedb",
+        dataset_name=f"TestSchemaEvolutionDataset{uniq_id()}",
+    )
+    pipeline.run(
+        some_data(),
+    )
+
+    assert_table(pipeline, "some_data", items=data)
+
+    aggregated_data = data.copy()
+
+    data = [
+        {
+            "doc_id": 3,
+            "content": "3",
+            "new_column": "new",
+        },
+        {
+            "doc_id": 4,
+            "content": "4",
+            "new_column": "new",
+        },
+    ]
+
+    pipeline.run(
+        some_data(),
+    )
+
+    table_schema = pipeline.default_schema.tables["some_data"]
+    assert "new_column" in table_schema["columns"]
+
+    aggregated_data.extend(data)
+
+    assert_table(pipeline, "some_data", items=aggregated_data)
+
+
+def test_merge_github_nested() -> None:
+    pipe = dlt.pipeline(destination="lancedb", dataset_name="github1", full_refresh=True)
+    assert pipe.dataset_name.startswith("github1_202")
+
+    with open(
+        "tests/normalize/cases/github.issues.load_page_5_duck.json",
+        "r",
+        encoding="utf-8",
+    ) as f:
+        data = json.load(f)
+
+    info = pipe.run(
+        lancedb_adapter(data[:17], embed=["title", "body"]),
+        table_name="issues",
+        write_disposition="merge",
+        primary_key="id",
+    )
+    assert_load_info(info)
+    # Assert that the schema contains tables with the right names.
+    assert set(pipe.default_schema.tables.keys()) == {
+        "_dlt_version",
+        "_dlt_loads",
+        "issues",
+        "_dlt_pipeline_state",
+        "issues__labels",
+        "issues__assignees",
+    }
+    assert {t["name"] for t in pipe.default_schema.data_tables()} == {
+        "issues",
+        "issues__labels",
+        "issues__assignees",
+    }
+    assert {t["name"] for t in pipe.default_schema.dlt_tables()} == {
+        "_dlt_version",
+        "_dlt_loads",
+        "_dlt_pipeline_state",
+    }
+    issues = pipe.default_schema.tables["issues"]
+    assert issues["columns"]["id"]["primary_key"] is True
+    # Make sure vectorization is enabled for the embedded columns only.
+    assert issues["columns"]["title"][VECTORIZE_HINT]  # type: ignore[literal-required]
+    assert issues["columns"]["body"][VECTORIZE_HINT]  # type: ignore[literal-required]
+    assert VECTORIZE_HINT not in issues["columns"]["url"]
+    assert_table(pipe, "issues", expected_items_count=17)
+
+
+def test_empty_dataset_allowed() -> None:
+    # dataset_name is optional, so no dataset name is autogenerated when it isn't explicitly passed.
+    pipe = dlt.pipeline(destination="lancedb", full_refresh=True)
+    client: LanceDBClient = pipe.destination_client()  # type: ignore[assignment]
+
+    assert pipe.dataset_name is None
+    info = pipe.run(lancedb_adapter(["context", "created", "not a stop word"], embed=["value"]))
+    # Dataset in load info is empty.
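+    # With no dataset name there is no table name prefix; the client should
+    # still expose its default sentinel table name (asserted below).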
+    assert info.dataset_name is None
+    client = pipe.destination_client()  # type: ignore[assignment]
+    assert client.dataset_name is None
+    assert client.sentinel_table == "dltSentinelTable"
+    assert_table(pipe, "content", expected_items_count=3)
diff --git a/tests/load/lancedb/utils.py b/tests/load/lancedb/utils.py
new file mode 100644
index 0000000000..dc3ea5304b
--- /dev/null
+++ b/tests/load/lancedb/utils.py
@@ -0,0 +1,74 @@
+from typing import Union, List, Any, Dict, Optional
+
+import numpy as np
+from lancedb.embeddings import TextEmbeddingFunction  # type: ignore
+
+import dlt
+from dlt.destinations.impl.lancedb.lancedb_client import LanceDBClient
+
+
+def assert_unordered_dicts_equal(
+    dict_list1: List[Dict[str, Any]], dict_list2: List[Dict[str, Any]]
+) -> None:
+    """
+    Assert that two lists of dictionaries contain the same dictionaries, ignoring None values.
+
+    Args:
+        dict_list1 (List[Dict[str, Any]]): The first list of dictionaries to compare.
+        dict_list2 (List[Dict[str, Any]]): The second list of dictionaries to compare.
+
+    Raises:
+        AssertionError: If the lists have different lengths or contain different dictionaries.
+    """
+    assert len(dict_list1) == len(dict_list2), "Lists have different lengths"
+
+    dict_set1 = {tuple(sorted((k, v) for k, v in d.items() if v is not None)) for d in dict_list1}
+    dict_set2 = {tuple(sorted((k, v) for k, v in d.items() if v is not None)) for d in dict_list2}
+
+    assert dict_set1 == dict_set2, "Lists contain different dictionaries"
+
+
+def assert_table(
+    pipeline: dlt.Pipeline,
+    table_name: str,
+    expected_items_count: Optional[int] = None,
+    items: Optional[List[Any]] = None,
+) -> None:
+    client: LanceDBClient = pipeline.destination_client()  # type: ignore[assignment]
+    qualified_table_name = client.make_qualified_table_name(table_name)
+
+    exists = client.table_exists(qualified_table_name)
+    assert exists
+
+    records = client.db_client.open_table(qualified_table_name).search().limit(50).to_list()
+
+    if expected_items_count is not None:
+        assert expected_items_count == len(records)
+
+    if items is None:
+        return
+
+    drop_keys = [
+        "_dlt_id",
+        "_dlt_load_id",
+        dlt.config.get("destination.lancedb.credentials.id_field_name", str) or "id__",
+        dlt.config.get("destination.lancedb.credentials.vector_field_name", str) or "vector__",
+    ]
+    objects_without_dlt_or_special_keys = [
+        {k: v for k, v in record.items() if k not in drop_keys} for record in records
+    ]
+
+    assert_unordered_dicts_equal(objects_without_dlt_or_special_keys, items)
+
+
+class MockEmbeddingFunc(TextEmbeddingFunction):
+    def generate_embeddings(
+        self,
+        texts: Union[List[str], np.ndarray],  # type: ignore[type-arg]
+        *args,
+        **kwargs,
+    ) -> List[np.ndarray]:  # type: ignore[type-arg]
+        # Return one fixed-size zero vector per input text so tests never
+        # call a real embedding provider.
+        return [np.zeros(self.ndims()) for _ in texts]
+
+    def ndims(self) -> int:
+        return 2
diff --git a/tests/load/qdrant/test_pipeline.py b/tests/load/qdrant/test_pipeline.py
index e0bd1fff97..e0cb9dab84 100644
--- a/tests/load/qdrant/test_pipeline.py
+++ b/tests/load/qdrant/test_pipeline.py
@@ -319,8 +319,8 @@ def test_merge_github_nested() -> None:
         primary_key="id",
     )
     assert_load_info(info)
+
     # assert if schema contains tables with right names
-    print(p.default_schema.tables.keys())
     assert set(p.default_schema.tables.keys()) == {
         "_dlt_version",
         "_dlt_loads",
diff --git a/tests/load/utils.py b/tests/load/utils.py
index 8c6446b921..00ed4e3bf3 100644
--- a/tests/load/utils.py
+++ b/tests/load/utils.py
@@ -276,8 +276,10 @@ def destinations_configs(
     assert set(SQL_DESTINATIONS) == {d.destination for d in destination_configs}

     if default_vector_configs:
-        # for now only weaviate
-        destination_configs += [DestinationTestConfiguration(destination="weaviate")]
+        destination_configs += [
+            DestinationTestConfiguration(destination="weaviate"),
+            DestinationTestConfiguration(destination="lancedb"),
+        ]

     if default_staging_configs or all_staging_configs:
         destination_configs += [
diff --git a/tests/utils.py b/tests/utils.py
index 47b6561c8e..bf3aafdb77 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -45,13 +45,22 @@
     "motherduck",
     "mssql",
     "qdrant",
+    "lancedb",
     "destination",
     "synapse",
     "databricks",
     "clickhouse",
     "dremio",
 }
-NON_SQL_DESTINATIONS = {"filesystem", "weaviate", "dummy", "motherduck", "qdrant", "destination"}
+NON_SQL_DESTINATIONS = {
+    "filesystem",
+    "weaviate",
+    "dummy",
+    "motherduck",
+    "qdrant",
+    "lancedb",
+    "destination",
+}
 SQL_DESTINATIONS = IMPLEMENTED_DESTINATIONS - NON_SQL_DESTINATIONS

 # exclude destination configs (for now used for athena and athena iceberg separation)