From cb7a5a82a5238b1f1f56c9b91497819f567f4841 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Thu, 12 Oct 2023 10:17:32 +0200 Subject: [PATCH 01/22] add OpenAI-compatible embedding --- .../connectors/destination-chroma/Dockerfile | 2 +- .../destination_chroma/config.py | 2 + .../destination_chroma/destination.py | 21 +------- .../destination-chroma/metadata.yaml | 2 +- .../connectors/destination-chroma/setup.py | 2 +- .../connectors/destination-milvus/Dockerfile | 2 +- .../destination_milvus/config.py | 2 + .../destination_milvus/destination.py | 20 +------ .../integration_tests/spec.json | 52 +++++++++++++++++++ .../destination-milvus/metadata.yaml | 2 +- .../connectors/destination-milvus/setup.py | 2 +- .../destination-pinecone/Dockerfile | 2 +- .../destination_pinecone/config.py | 3 +- .../destination_pinecone/destination.py | 11 +--- .../integration_tests/spec.json | 52 +++++++++++++++++++ .../destination-pinecone/metadata.yaml | 2 +- .../connectors/destination-pinecone/setup.py | 2 +- .../connectors/destination-qdrant/Dockerfile | 2 +- .../destination_qdrant/config.py | 2 + .../destination_qdrant/destination.py | 20 +------ .../destination-qdrant/metadata.yaml | 2 +- .../connectors/destination-qdrant/setup.py | 2 +- .../destination-weaviate/Dockerfile | 2 +- .../destination_weaviate/config.py | 2 + .../destination_weaviate/destination.py | 20 +------ .../integration_tests/spec.json | 52 +++++++++++++++++++ .../destination-weaviate/metadata.yaml | 2 +- .../connectors/destination-weaviate/setup.py | 2 +- 28 files changed, 191 insertions(+), 98 deletions(-) diff --git a/airbyte-integrations/connectors/destination-chroma/Dockerfile b/airbyte-integrations/connectors/destination-chroma/Dockerfile index 6c74f55542a0..be1addcaf4cb 100644 --- a/airbyte-integrations/connectors/destination-chroma/Dockerfile +++ b/airbyte-integrations/connectors/destination-chroma/Dockerfile @@ -41,5 +41,5 @@ COPY destination_chroma ./destination_chroma ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.0.3 +LABEL io.airbyte.version=0.0.4 LABEL io.airbyte.name=airbyte/destination-chroma diff --git a/airbyte-integrations/connectors/destination-chroma/destination_chroma/config.py b/airbyte-integrations/connectors/destination-chroma/destination_chroma/config.py index 9f60ad022cfb..ccb746924167 100644 --- a/airbyte-integrations/connectors/destination-chroma/destination_chroma/config.py +++ b/airbyte-integrations/connectors/destination-chroma/destination_chroma/config.py @@ -9,6 +9,7 @@ CohereEmbeddingConfigModel, FakeEmbeddingConfigModel, FromFieldEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, OpenAIEmbeddingConfigModel, ProcessingConfigModel, ) @@ -73,6 +74,7 @@ class ConfigModel(BaseModel): FakeEmbeddingConfigModel, FromFieldEmbeddingConfigModel, NoEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, ] = Field(..., title="Embedding", description="Embedding configuration", discriminator="mode", group="embedding", type="object") indexing: ChromaIndexingConfigModel diff --git a/airbyte-integrations/connectors/destination-chroma/destination_chroma/destination.py b/airbyte-integrations/connectors/destination-chroma/destination_chroma/destination.py index 23d1467d7dd9..87bb115313a4 100644 --- a/airbyte-integrations/connectors/destination-chroma/destination_chroma/destination.py +++ b/airbyte-integrations/connectors/destination-chroma/destination_chroma/destination.py @@ -8,12 +8,8 @@ from 
airbyte_cdk import AirbyteLogger from airbyte_cdk.destinations import Destination from airbyte_cdk.destinations.vector_db_based.embedder import ( - AzureOpenAIEmbedder, - CohereEmbedder, + create_from_config, Embedder, - FakeEmbedder, - FromFieldEmbedder, - OpenAIEmbedder, ) from airbyte_cdk.destinations.vector_db_based.indexer import Indexer from airbyte_cdk.destinations.vector_db_based.writer import Writer @@ -31,26 +27,13 @@ BATCH_SIZE = 128 -embedder_map = { - "openai": OpenAIEmbedder, - "azure_openai": AzureOpenAIEmbedder, - "cohere": CohereEmbedder, - "fake": FakeEmbedder, - "from_field": FromFieldEmbedder, - "no_embedding": NoEmbedder, -} - - class DestinationChroma(Destination): indexer: Indexer embedder: Embedder def _init_indexer(self, config: ConfigModel): - if config.embedding.mode == "azure_openai" or config.embedding.mode == "openai": - self.embedder = embedder_map[config.embedding.mode](config.embedding, config.processing.chunk_size) - else: - self.embedder = embedder_map[config.embedding.mode](config.embedding) + self.embedder = create_from_config(config.embedding, config.processing) if config.embedding.mode != "no_embedding" else NoEmbedder(config.embedding) self.indexer = ChromaIndexer(config.indexing) def write( diff --git a/airbyte-integrations/connectors/destination-chroma/metadata.yaml b/airbyte-integrations/connectors/destination-chroma/metadata.yaml index 627eb3c77e04..b25518878202 100644 --- a/airbyte-integrations/connectors/destination-chroma/metadata.yaml +++ b/airbyte-integrations/connectors/destination-chroma/metadata.yaml @@ -7,7 +7,7 @@ data: connectorSubtype: database connectorType: destination definitionId: 0b75218b-f702-4a28-85ac-34d3d84c0fc2 - dockerImageTag: 0.0.3 + dockerImageTag: 0.0.4 dockerRepository: airbyte/destination-chroma githubIssueLabel: destination-chroma icon: chroma.svg diff --git a/airbyte-integrations/connectors/destination-chroma/setup.py b/airbyte-integrations/connectors/destination-chroma/setup.py index bc43a2e3a852..29f3e143486a 100644 --- a/airbyte-integrations/connectors/destination-chroma/setup.py +++ b/airbyte-integrations/connectors/destination-chroma/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk[vector-db-based]==0.51.28", + "airbyte-cdk[vector-db-based]==0.51.30", "chromadb", ] diff --git a/airbyte-integrations/connectors/destination-milvus/Dockerfile b/airbyte-integrations/connectors/destination-milvus/Dockerfile index ad39f0417f94..cb5a56d2c8b3 100644 --- a/airbyte-integrations/connectors/destination-milvus/Dockerfile +++ b/airbyte-integrations/connectors/destination-milvus/Dockerfile @@ -37,5 +37,5 @@ COPY destination_milvus ./destination_milvus ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.0.4 +LABEL io.airbyte.version=0.0.5 LABEL io.airbyte.name=airbyte/destination-milvus diff --git a/airbyte-integrations/connectors/destination-milvus/destination_milvus/config.py b/airbyte-integrations/connectors/destination-milvus/destination_milvus/config.py index 05ab90289ce9..b976db116314 100644 --- a/airbyte-integrations/connectors/destination-milvus/destination_milvus/config.py +++ b/airbyte-integrations/connectors/destination-milvus/destination_milvus/config.py @@ -12,6 +12,7 @@ FromFieldEmbeddingConfigModel, OpenAIEmbeddingConfigModel, ProcessingConfigModel, + OpenAICompatibleEmbeddingConfigModel, ) from airbyte_cdk.utils.spec_schema_transformations import 
resolve_refs from pydantic import BaseModel, Field @@ -78,6 +79,7 @@ class ConfigModel(BaseModel): FakeEmbeddingConfigModel, FromFieldEmbeddingConfigModel, AzureOpenAIEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, ] = Field(..., title="Embedding", description="Embedding configuration", discriminator="mode", group="embedding", type="object") indexing: MilvusIndexingConfigModel diff --git a/airbyte-integrations/connectors/destination-milvus/destination_milvus/destination.py b/airbyte-integrations/connectors/destination-milvus/destination_milvus/destination.py index 5e5fd4ba2ecb..4c5a5b0a8797 100644 --- a/airbyte-integrations/connectors/destination-milvus/destination_milvus/destination.py +++ b/airbyte-integrations/connectors/destination-milvus/destination_milvus/destination.py @@ -8,12 +8,8 @@ from airbyte_cdk import AirbyteLogger from airbyte_cdk.destinations import Destination from airbyte_cdk.destinations.vector_db_based.embedder import ( - AzureOpenAIEmbedder, - CohereEmbedder, + create_from_config, Embedder, - FakeEmbedder, - FromFieldEmbedder, - OpenAIEmbedder, ) from airbyte_cdk.destinations.vector_db_based.indexer import Indexer from airbyte_cdk.destinations.vector_db_based.writer import Writer @@ -25,24 +21,12 @@ BATCH_SIZE = 128 -embedder_map = { - "openai": OpenAIEmbedder, - "cohere": CohereEmbedder, - "fake": FakeEmbedder, - "azure_openai": AzureOpenAIEmbedder, - "from_field": FromFieldEmbedder, -} - - class DestinationMilvus(Destination): indexer: Indexer embedder: Embedder def _init_indexer(self, config: ConfigModel): - if config.embedding.mode == "azure_openai" or config.embedding.mode == "openai": - self.embedder = embedder_map[config.embedding.mode](config.embedding, config.processing.chunk_size) - else: - self.embedder = embedder_map[config.embedding.mode](config.embedding) + self.embedder = create_from_config(config.embedding, config.processing) self.indexer = MilvusIndexer(config.indexing, self.embedder.embedding_dimensions) def write( diff --git a/airbyte-integrations/connectors/destination-milvus/integration_tests/spec.json b/airbyte-integrations/connectors/destination-milvus/integration_tests/spec.json index 6d5e86d97d2c..be23c0ec4e77 100644 --- a/airbyte-integrations/connectors/destination-milvus/integration_tests/spec.json +++ b/airbyte-integrations/connectors/destination-milvus/integration_tests/spec.json @@ -255,6 +255,58 @@ }, "required": ["openai_key", "api_base", "deployment"], "description": "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions." 
+ }, + { + "title": "OpenAI-compatible", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "openai_compatible", + "const": "openai_compatible", + "enum": [ + "openai_compatible" + ], + "type": "string" + }, + "api_key": { + "title": "API key", + "default": "", + "airbyte_secret": true, + "type": "string" + }, + "base_url": { + "title": "Base URL", + "description": "The base URL for your OpenAI-compatible service", + "examples": [ + "https://your-service-name.com" + ], + "type": "string" + }, + "model_name": { + "title": "Model name", + "description": "The name of the model to use for embedding", + "default": "text-embedding-ada-002", + "examples": [ + "text-embedding-ada-002" + ], + "type": "string" + }, + "dimensions": { + "title": "Embedding dimensions", + "description": "The number of dimensions the embedding model is generating", + "examples": [ + 1536, + 384 + ], + "type": "integer" + } + }, + "required": [ + "base_url", + "dimensions" + ], + "description": "Use a service that's compatible with the OpenAI API to embed text." } ] }, diff --git a/airbyte-integrations/connectors/destination-milvus/metadata.yaml b/airbyte-integrations/connectors/destination-milvus/metadata.yaml index f190dcf46808..dce6f4b1222d 100644 --- a/airbyte-integrations/connectors/destination-milvus/metadata.yaml +++ b/airbyte-integrations/connectors/destination-milvus/metadata.yaml @@ -20,7 +20,7 @@ data: connectorSubtype: database connectorType: destination definitionId: 65de8962-48c9-11ee-be56-0242ac120002 - dockerImageTag: 0.0.4 + dockerImageTag: 0.0.5 dockerRepository: airbyte/destination-milvus githubIssueLabel: destination-milvus icon: milvus.svg diff --git a/airbyte-integrations/connectors/destination-milvus/setup.py b/airbyte-integrations/connectors/destination-milvus/setup.py index bf909b74b88b..0743fb9d5ccd 100644 --- a/airbyte-integrations/connectors/destination-milvus/setup.py +++ b/airbyte-integrations/connectors/destination-milvus/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, setup -MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.28", "pymilvus==2.3.0"] +MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.30", "pymilvus==2.3.0"] TEST_REQUIREMENTS = ["pytest~=6.2"] diff --git a/airbyte-integrations/connectors/destination-pinecone/Dockerfile b/airbyte-integrations/connectors/destination-pinecone/Dockerfile index 152569d21d99..c010916dcde3 100644 --- a/airbyte-integrations/connectors/destination-pinecone/Dockerfile +++ b/airbyte-integrations/connectors/destination-pinecone/Dockerfile @@ -38,6 +38,6 @@ COPY destination_pinecone ./destination_pinecone ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.0.15 +LABEL io.airbyte.version=0.0.16 LABEL io.airbyte.name=airbyte/destination-pinecone diff --git a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/config.py b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/config.py index 5b0ee1797116..058b7562ce68 100644 --- a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/config.py +++ b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/config.py @@ -10,6 +10,7 @@ CohereEmbeddingConfigModel, FakeEmbeddingConfigModel, OpenAIEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, ProcessingConfigModel, ) from airbyte_cdk.utils.spec_schema_transformations import resolve_refs @@ -40,7 +41,7 @@ class 
ConfigModel(BaseModel): indexing: PineconeIndexingModel embedding: Union[ - OpenAIEmbeddingConfigModel, CohereEmbeddingConfigModel, FakeEmbeddingConfigModel, AzureOpenAIEmbeddingConfigModel + OpenAIEmbeddingConfigModel, CohereEmbeddingConfigModel, FakeEmbeddingConfigModel, AzureOpenAIEmbeddingConfigModel, OpenAICompatibleEmbeddingConfigModel ] = Field(..., title="Embedding", description="Embedding configuration", discriminator="mode", group="embedding", type="object") processing: ProcessingConfigModel diff --git a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py index adb8c466c300..b5a86ec5bc63 100644 --- a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py +++ b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py @@ -7,7 +7,7 @@ from airbyte_cdk import AirbyteLogger from airbyte_cdk.destinations import Destination -from airbyte_cdk.destinations.vector_db_based.embedder import AzureOpenAIEmbedder, CohereEmbedder, Embedder, FakeEmbedder, OpenAIEmbedder +from airbyte_cdk.destinations.vector_db_based.embedder import create_from_config, Embedder from airbyte_cdk.destinations.vector_db_based.indexer import Indexer from airbyte_cdk.destinations.vector_db_based.writer import Writer from airbyte_cdk.models import AirbyteConnectionStatus, AirbyteMessage, ConfiguredAirbyteCatalog, ConnectorSpecification, Status @@ -17,19 +17,12 @@ BATCH_SIZE = 32 - -embedder_map = {"openai": OpenAIEmbedder, "cohere": CohereEmbedder, "fake": FakeEmbedder, "azure_openai": AzureOpenAIEmbedder} - - class DestinationPinecone(Destination): indexer: Indexer embedder: Embedder def _init_indexer(self, config: ConfigModel): - if config.embedding.mode == "azure_openai" or config.embedding.mode == "openai": - self.embedder = embedder_map[config.embedding.mode](config.embedding, config.processing.chunk_size) - else: - self.embedder = embedder_map[config.embedding.mode](config.embedding) + self.embedder = create_from_config(config.embedding, config.processing) self.indexer = PineconeIndexer(config.indexing, self.embedder.embedding_dimensions) def write( diff --git a/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json b/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json index 37e5719e223f..91c9a094a630 100644 --- a/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json +++ b/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json @@ -122,6 +122,58 @@ }, "required": ["openai_key", "api_base", "deployment"], "description": "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions." 
+ }, + { + "title": "OpenAI-compatible", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "openai_compatible", + "const": "openai_compatible", + "enum": [ + "openai_compatible" + ], + "type": "string" + }, + "api_key": { + "title": "API key", + "default": "", + "airbyte_secret": true, + "type": "string" + }, + "base_url": { + "title": "Base URL", + "description": "The base URL for your OpenAI-compatible service", + "examples": [ + "https://your-service-name.com" + ], + "type": "string" + }, + "model_name": { + "title": "Model name", + "description": "The name of the model to use for embedding", + "default": "text-embedding-ada-002", + "examples": [ + "text-embedding-ada-002" + ], + "type": "string" + }, + "dimensions": { + "title": "Embedding dimensions", + "description": "The number of dimensions the embedding model is generating", + "examples": [ + 1536, + 384 + ], + "type": "integer" + } + }, + "required": [ + "base_url", + "dimensions" + ], + "description": "Use a service that's compatible with the OpenAI API to embed text." } ] }, diff --git a/airbyte-integrations/connectors/destination-pinecone/metadata.yaml b/airbyte-integrations/connectors/destination-pinecone/metadata.yaml index 1f2ad88f09d7..acba2b7189e9 100644 --- a/airbyte-integrations/connectors/destination-pinecone/metadata.yaml +++ b/airbyte-integrations/connectors/destination-pinecone/metadata.yaml @@ -20,7 +20,7 @@ data: connectorSubtype: database connectorType: destination definitionId: 3d2b6f84-7f0d-4e3f-a5e5-7c7d4b50eabd - dockerImageTag: 0.0.15 + dockerImageTag: 0.0.16 dockerRepository: airbyte/destination-pinecone githubIssueLabel: destination-pinecone icon: pinecone.svg diff --git a/airbyte-integrations/connectors/destination-pinecone/setup.py b/airbyte-integrations/connectors/destination-pinecone/setup.py index 7cadc04b30b5..57add87ef602 100644 --- a/airbyte-integrations/connectors/destination-pinecone/setup.py +++ b/airbyte-integrations/connectors/destination-pinecone/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk[vector-db-based]==0.51.28", + "airbyte-cdk[vector-db-based]==0.51.30", "pinecone-client[grpc]", ] diff --git a/airbyte-integrations/connectors/destination-qdrant/Dockerfile b/airbyte-integrations/connectors/destination-qdrant/Dockerfile index f56ab5821b1f..996100035672 100644 --- a/airbyte-integrations/connectors/destination-qdrant/Dockerfile +++ b/airbyte-integrations/connectors/destination-qdrant/Dockerfile @@ -41,5 +41,5 @@ COPY destination_qdrant ./destination_qdrant ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.0.4 +LABEL io.airbyte.version=0.0.5 LABEL io.airbyte.name=airbyte/destination-qdrant diff --git a/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/config.py b/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/config.py index edbd9398a377..18c545e88b21 100644 --- a/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/config.py +++ b/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/config.py @@ -14,6 +14,7 @@ FakeEmbeddingConfigModel, FromFieldEmbeddingConfigModel, OpenAIEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, ProcessingConfigModel, ) from jsonschema import RefResolver @@ -67,6 +68,7 @@ class ConfigModel(BaseModel): FakeEmbeddingConfigModel, FromFieldEmbeddingConfigModel, 
AzureOpenAIEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, ] = Field(..., title="Embedding", description="Embedding configuration", discriminator="mode", group="embedding", type="object") indexing: QdrantIndexingConfigModel diff --git a/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/destination.py b/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/destination.py index 3d67e13a0d4a..8c07ebd5114c 100644 --- a/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/destination.py +++ b/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/destination.py @@ -7,12 +7,8 @@ from airbyte_cdk import AirbyteLogger from airbyte_cdk.destinations import Destination from airbyte_cdk.destinations.vector_db_based.embedder import ( - AzureOpenAIEmbedder, - CohereEmbedder, + create_from_config, Embedder, - FakeEmbedder, - FromFieldEmbedder, - OpenAIEmbedder, ) from airbyte_cdk.destinations.vector_db_based.indexer import Indexer from airbyte_cdk.destinations.vector_db_based.writer import Writer @@ -23,24 +19,12 @@ BATCH_SIZE = 256 -embedder_map = { - "openai": OpenAIEmbedder, - "azure_openai": AzureOpenAIEmbedder, - "cohere": CohereEmbedder, - "fake": FakeEmbedder, - "from_field": FromFieldEmbedder, -} - - class DestinationQdrant(Destination): indexer: Indexer embedder: Embedder def _init_indexer(self, config: ConfigModel): - if config.embedding.mode == "azure_openai" or config.embedding.mode == "openai": - self.embedder = embedder_map[config.embedding.mode](config.embedding, config.processing.chunk_size) - else: - self.embedder = embedder_map[config.embedding.mode](config.embedding) + self.embedder = create_from_config(config.embedding, config.processing) self.indexer = QdrantIndexer(config.indexing, self.embedder.embedding_dimensions) def write( diff --git a/airbyte-integrations/connectors/destination-qdrant/metadata.yaml b/airbyte-integrations/connectors/destination-qdrant/metadata.yaml index b105f760724d..bed023420ac5 100644 --- a/airbyte-integrations/connectors/destination-qdrant/metadata.yaml +++ b/airbyte-integrations/connectors/destination-qdrant/metadata.yaml @@ -20,7 +20,7 @@ data: connectorSubtype: database connectorType: destination definitionId: 6eb1198a-6d38-43e5-aaaa-dccd8f71db2b - dockerImageTag: 0.0.4 + dockerImageTag: 0.0.5 dockerRepository: airbyte/destination-qdrant githubIssueLabel: destination-qdrant icon: qdrant.svg diff --git a/airbyte-integrations/connectors/destination-qdrant/setup.py b/airbyte-integrations/connectors/destination-qdrant/setup.py index a679e7d29927..fe20ebc3cc65 100644 --- a/airbyte-integrations/connectors/destination-qdrant/setup.py +++ b/airbyte-integrations/connectors/destination-qdrant/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, setup -MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.28", "qdrant-client", "fastembed"] +MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.30", "qdrant-client", "fastembed"] TEST_REQUIREMENTS = ["pytest~=6.2"] diff --git a/airbyte-integrations/connectors/destination-weaviate/Dockerfile b/airbyte-integrations/connectors/destination-weaviate/Dockerfile index eca78a55aac2..9caf17a11ec2 100644 --- a/airbyte-integrations/connectors/destination-weaviate/Dockerfile +++ b/airbyte-integrations/connectors/destination-weaviate/Dockerfile @@ -37,5 +37,5 @@ COPY destination_weaviate ./destination_weaviate ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] 
-LABEL io.airbyte.version=0.2.1 +LABEL io.airbyte.version=0.2.2 LABEL io.airbyte.name=airbyte/destination-weaviate \ No newline at end of file diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py index ddd825732e49..fb1a596d9cc8 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py @@ -11,6 +11,7 @@ FakeEmbeddingConfigModel, FromFieldEmbeddingConfigModel, OpenAIEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, ProcessingConfigModel, ) from airbyte_cdk.utils.spec_schema_transformations import resolve_refs @@ -113,6 +114,7 @@ class ConfigModel(BaseModel): CohereEmbeddingConfigModel, FromFieldEmbeddingConfigModel, FakeEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, ] = Field(..., title="Embedding", description="Embedding configuration", discriminator="mode", group="embedding", type="object") indexing: WeaviateIndexingConfigModel diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/destination.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/destination.py index 00a38607e35c..711b3ca34e62 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/destination.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/destination.py @@ -8,12 +8,8 @@ from airbyte_cdk import AirbyteLogger from airbyte_cdk.destinations import Destination from airbyte_cdk.destinations.vector_db_based.embedder import ( - AzureOpenAIEmbedder, - CohereEmbedder, + create_from_config, Embedder, - FakeEmbedder, - FromFieldEmbedder, - OpenAIEmbedder, ) from airbyte_cdk.destinations.vector_db_based.indexer import Indexer from airbyte_cdk.destinations.vector_db_based.writer import Writer @@ -23,25 +19,13 @@ from destination_weaviate.indexer import WeaviateIndexer from destination_weaviate.no_embedder import NoEmbedder -embedder_map = { - "openai": OpenAIEmbedder, - "cohere": CohereEmbedder, - "fake": FakeEmbedder, - "from_field": FromFieldEmbedder, - "no_embedding": NoEmbedder, - "azure_openai": AzureOpenAIEmbedder, -} - class DestinationWeaviate(Destination): indexer: Indexer embedder: Embedder def _init_indexer(self, config: ConfigModel): - if config.embedding.mode == "azure_openai" or config.embedding.mode == "openai": - self.embedder = embedder_map[config.embedding.mode](config.embedding, config.processing.chunk_size) - else: - self.embedder = embedder_map[config.embedding.mode](config.embedding) + self.embedder = create_from_config(config.embedding, config.processing) if config.embedding.mode != "no_embedding" else NoEmbedder(config.embedding) self.indexer = WeaviateIndexer(config.indexing) def write( diff --git a/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json b/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json index fb8bf6594fd1..90ea6a0b9d1f 100644 --- a/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json +++ b/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json @@ -269,6 +269,58 @@ } }, "description": "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs." 
+ }, + { + "title": "OpenAI-compatible", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "openai_compatible", + "const": "openai_compatible", + "enum": [ + "openai_compatible" + ], + "type": "string" + }, + "api_key": { + "title": "API key", + "default": "", + "airbyte_secret": true, + "type": "string" + }, + "base_url": { + "title": "Base URL", + "description": "The base URL for your OpenAI-compatible service", + "examples": [ + "https://your-service-name.com" + ], + "type": "string" + }, + "model_name": { + "title": "Model name", + "description": "The name of the model to use for embedding", + "default": "text-embedding-ada-002", + "examples": [ + "text-embedding-ada-002" + ], + "type": "string" + }, + "dimensions": { + "title": "Embedding dimensions", + "description": "The number of dimensions the embedding model is generating", + "examples": [ + 1536, + 384 + ], + "type": "integer" + } + }, + "required": [ + "base_url", + "dimensions" + ], + "description": "Use a service that's compatible with the OpenAI API to embed text." } ] }, diff --git a/airbyte-integrations/connectors/destination-weaviate/metadata.yaml b/airbyte-integrations/connectors/destination-weaviate/metadata.yaml index 86d64773d856..c13fc1973062 100644 --- a/airbyte-integrations/connectors/destination-weaviate/metadata.yaml +++ b/airbyte-integrations/connectors/destination-weaviate/metadata.yaml @@ -8,7 +8,7 @@ data: connectorSubtype: database connectorType: destination definitionId: 7b7d7a0d-954c-45a0-bcfc-39a634b97736 - dockerImageTag: 0.2.1 + dockerImageTag: 0.2.2 dockerRepository: airbyte/destination-weaviate githubIssueLabel: destination-weaviate icon: weaviate.svg diff --git a/airbyte-integrations/connectors/destination-weaviate/setup.py b/airbyte-integrations/connectors/destination-weaviate/setup.py index dcb65d0c8b73..cb5c97dac60c 100644 --- a/airbyte-integrations/connectors/destination-weaviate/setup.py +++ b/airbyte-integrations/connectors/destination-weaviate/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, setup -MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.28", "weaviate-client==3.23.2"] +MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.30", "weaviate-client==3.23.2"] TEST_REQUIREMENTS = ["pytest~=6.2", "docker", "pytest-docker"] From 9c325265de4e4585aeca9860a6cce56a4cf4a528 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Thu, 12 Oct 2023 10:20:26 +0200 Subject: [PATCH 02/22] add changelog --- docs/integrations/destinations/chroma.md | 1 + docs/integrations/destinations/milvus.md | 1 + docs/integrations/destinations/pinecone.md | 1 + docs/integrations/destinations/qdrant.md | 1 + docs/integrations/destinations/weaviate.md | 1 + 5 files changed, 5 insertions(+) diff --git a/docs/integrations/destinations/chroma.md b/docs/integrations/destinations/chroma.md index 7ebc7b78ca1d..b2042de948bc 100644 --- a/docs/integrations/destinations/chroma.md +++ b/docs/integrations/destinations/chroma.md @@ -75,6 +75,7 @@ You should now have all the requirements needed to configure Chroma as a destina | Version | Date | Pull Request | Subject | | :------ | :--------- | :--------------------------------------------------------- | :----------------------------------------- | +| 0.0.4 | 2023-10-15 | [#31329](https://github.com/airbytehq/airbyte/pull/31329) | Add OpenAI-compatible embedder option | | 0.0.3 | 2023-10-04 | [#31075](https://github.com/airbytehq/airbyte/pull/31075) | Fix OpenAI embedder batch size | | 0.0.2 | 2023-09-29 | 
[#30820](https://github.com/airbytehq/airbyte/pull/30820) | Update CDK | | 0.0.1 | 2023-09-08 | [#30023](https://github.com/airbytehq/airbyte/pull/30023) | 🎉 New Destination: Chroma (Vector Database) | diff --git a/docs/integrations/destinations/milvus.md b/docs/integrations/destinations/milvus.md index 9a9a7808601a..bf050d32befb 100644 --- a/docs/integrations/destinations/milvus.md +++ b/docs/integrations/destinations/milvus.md @@ -105,6 +105,7 @@ vector_store.similarity_search("test") | Version | Date | Pull Request | Subject | |:--------| :--------- |:--------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------| +| 0.0.5 | 2023-10-15 | [#31329](https://github.com/airbytehq/airbyte/pull/31329) | Add OpenAI-compatible embedder option | | 0.0.4 | 2023-10-04 | [#31075](https://github.com/airbytehq/airbyte/pull/31075) | Fix OpenAI embedder batch size | | 0.0.3 | 2023-09-29 | [#30820](https://github.com/airbytehq/airbyte/pull/30820) | Update CDK | | 0.0.2 | 2023-08-25 | [#30689](https://github.com/airbytehq/airbyte/pull/30689) | Update CDK to support azure OpenAI embeddings and text splitting options, make sure primary key field is not accidentally set, promote to certified | diff --git a/docs/integrations/destinations/pinecone.md b/docs/integrations/destinations/pinecone.md index 3d4153f66ecf..54fb64f64ba9 100644 --- a/docs/integrations/destinations/pinecone.md +++ b/docs/integrations/destinations/pinecone.md @@ -74,6 +74,7 @@ OpenAI and Fake embeddings produce vectors with 1536 dimensions, and the Cohere | Version | Date | Pull Request | Subject | |:--------| :--------- |:--------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------| +| 0.0.16 | 2023-10-15 | [#31329](https://github.com/airbytehq/airbyte/pull/31329) | Add OpenAI-compatible embedder option | | 0.0.15 | 2023-10-04 | [#31075](https://github.com/airbytehq/airbyte/pull/31075) | Fix OpenAI embedder batch size | | 0.0.14 | 2023-09-29 | [#30820](https://github.com/airbytehq/airbyte/pull/30820) | Update CDK | | 0.0.13 | 2023-09-26 | [#30649](https://github.com/airbytehq/airbyte/pull/30649) | Allow more text splitting options | diff --git a/docs/integrations/destinations/qdrant.md b/docs/integrations/destinations/qdrant.md index eb55b7a1669b..127cd7bf7785 100644 --- a/docs/integrations/destinations/qdrant.md +++ b/docs/integrations/destinations/qdrant.md @@ -70,6 +70,7 @@ You should now have all the requirements needed to configure Qdrant as a destina | Version | Date | Pull Request | Subject | | :------ | :--------- | :--------------------------------------------------------- | :----------------------------------------- | +| 0.0.5 | 2023-10-15 | [#31329](https://github.com/airbytehq/airbyte/pull/31329) | Add OpenAI-compatible embedder option | | 0.0.4 | 2023-10-04 | [#31075](https://github.com/airbytehq/airbyte/pull/31075) | Fix OpenAI embedder batch size | | 0.0.3 | 2023-09-29 | [#30820](https://github.com/airbytehq/airbyte/pull/30820) | Update CDK | | 0.0.2 | 2023-09-25 | [#30689](https://github.com/airbytehq/airbyte/pull/30689) | Update CDK to support Azure OpenAI embeddings and text splitting options | diff --git a/docs/integrations/destinations/weaviate.md b/docs/integrations/destinations/weaviate.md index 
bc23c60823eb..0d568a6dafa7 100644 --- a/docs/integrations/destinations/weaviate.md +++ b/docs/integrations/destinations/weaviate.md @@ -83,6 +83,7 @@ As properties have to start will a lowercase letter in Weaviate, field names mig | Version | Date | Pull Request | Subject | | :------ | :--------- | :--------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------- | +| 0.2.2 | 2023-10-15 | [#31329](https://github.com/airbytehq/airbyte/pull/31329) | Add OpenAI-compatible embedder option | | 0.2.1 | 2023-10-04 | [#31075](https://github.com/airbytehq/airbyte/pull/31075) | Fix OpenAI embedder batch size and conflict field name handling | | 0.2.0 | 2023-09-22 | [#30151](https://github.com/airbytehq/airbyte/pull/30151) | Add embedding capabilities, overwrite and dedup support and API key auth mode, make certified. 🚨 Breaking changes - check migrations guide. | | 0.1.1 | 2022-02-08 | [\#22527](https://github.com/airbytehq/airbyte/pull/22527) | Multiple bug fixes: Support String based IDs, arrays of uknown type and additionalProperties of type object and array of objects | From eda2bb46cdc92c62ba936a8b16a5b3fbebfb2a6d Mon Sep 17 00:00:00 2001 From: flash1293 Date: Thu, 12 Oct 2023 08:46:35 +0000 Subject: [PATCH 03/22] Automated Commit - Formatting Changes --- .../destination_chroma/destination.py | 12 +++++----- .../destination_milvus/config.py | 2 +- .../destination_milvus/destination.py | 5 +---- .../integration_tests/spec.json | 22 +++++-------------- .../destination_pinecone/config.py | 8 +++++-- .../destination_pinecone/destination.py | 3 ++- .../integration_tests/spec.json | 22 +++++-------------- .../destination_qdrant/config.py | 2 +- .../destination_qdrant/destination.py | 6 ++--- .../destination_weaviate/config.py | 2 +- .../destination_weaviate/destination.py | 11 +++++----- .../integration_tests/spec.json | 22 +++++-------------- 12 files changed, 42 insertions(+), 75 deletions(-) diff --git a/airbyte-integrations/connectors/destination-chroma/destination_chroma/destination.py b/airbyte-integrations/connectors/destination-chroma/destination_chroma/destination.py index 87bb115313a4..2d6207eb2e5f 100644 --- a/airbyte-integrations/connectors/destination-chroma/destination_chroma/destination.py +++ b/airbyte-integrations/connectors/destination-chroma/destination_chroma/destination.py @@ -7,10 +7,7 @@ from airbyte_cdk import AirbyteLogger from airbyte_cdk.destinations import Destination -from airbyte_cdk.destinations.vector_db_based.embedder import ( - create_from_config, - Embedder, -) +from airbyte_cdk.destinations.vector_db_based.embedder import Embedder, create_from_config from airbyte_cdk.destinations.vector_db_based.indexer import Indexer from airbyte_cdk.destinations.vector_db_based.writer import Writer from airbyte_cdk.models import ( @@ -27,13 +24,18 @@ BATCH_SIZE = 128 + class DestinationChroma(Destination): indexer: Indexer embedder: Embedder def _init_indexer(self, config: ConfigModel): - self.embedder = create_from_config(config.embedding, config.processing) if config.embedding.mode != "no_embedding" else NoEmbedder(config.embedding) + self.embedder = ( + create_from_config(config.embedding, config.processing) + if config.embedding.mode != "no_embedding" + else NoEmbedder(config.embedding) + ) self.indexer = ChromaIndexer(config.indexing) def write( diff --git a/airbyte-integrations/connectors/destination-milvus/destination_milvus/config.py 
b/airbyte-integrations/connectors/destination-milvus/destination_milvus/config.py index b976db116314..7aa9ed5a62a2 100644 --- a/airbyte-integrations/connectors/destination-milvus/destination_milvus/config.py +++ b/airbyte-integrations/connectors/destination-milvus/destination_milvus/config.py @@ -10,9 +10,9 @@ CohereEmbeddingConfigModel, FakeEmbeddingConfigModel, FromFieldEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, OpenAIEmbeddingConfigModel, ProcessingConfigModel, - OpenAICompatibleEmbeddingConfigModel, ) from airbyte_cdk.utils.spec_schema_transformations import resolve_refs from pydantic import BaseModel, Field diff --git a/airbyte-integrations/connectors/destination-milvus/destination_milvus/destination.py b/airbyte-integrations/connectors/destination-milvus/destination_milvus/destination.py index 4c5a5b0a8797..d5baa8f3b920 100644 --- a/airbyte-integrations/connectors/destination-milvus/destination_milvus/destination.py +++ b/airbyte-integrations/connectors/destination-milvus/destination_milvus/destination.py @@ -7,10 +7,7 @@ from airbyte_cdk import AirbyteLogger from airbyte_cdk.destinations import Destination -from airbyte_cdk.destinations.vector_db_based.embedder import ( - create_from_config, - Embedder, -) +from airbyte_cdk.destinations.vector_db_based.embedder import Embedder, create_from_config from airbyte_cdk.destinations.vector_db_based.indexer import Indexer from airbyte_cdk.destinations.vector_db_based.writer import Writer from airbyte_cdk.models import AirbyteConnectionStatus, AirbyteMessage, ConfiguredAirbyteCatalog, ConnectorSpecification, Status diff --git a/airbyte-integrations/connectors/destination-milvus/integration_tests/spec.json b/airbyte-integrations/connectors/destination-milvus/integration_tests/spec.json index be23c0ec4e77..f61a2387dd9d 100644 --- a/airbyte-integrations/connectors/destination-milvus/integration_tests/spec.json +++ b/airbyte-integrations/connectors/destination-milvus/integration_tests/spec.json @@ -264,9 +264,7 @@ "title": "Mode", "default": "openai_compatible", "const": "openai_compatible", - "enum": [ - "openai_compatible" - ], + "enum": ["openai_compatible"], "type": "string" }, "api_key": { @@ -278,34 +276,24 @@ "base_url": { "title": "Base URL", "description": "The base URL for your OpenAI-compatible service", - "examples": [ - "https://your-service-name.com" - ], + "examples": ["https://your-service-name.com"], "type": "string" }, "model_name": { "title": "Model name", "description": "The name of the model to use for embedding", "default": "text-embedding-ada-002", - "examples": [ - "text-embedding-ada-002" - ], + "examples": ["text-embedding-ada-002"], "type": "string" }, "dimensions": { "title": "Embedding dimensions", "description": "The number of dimensions the embedding model is generating", - "examples": [ - 1536, - 384 - ], + "examples": [1536, 384], "type": "integer" } }, - "required": [ - "base_url", - "dimensions" - ], + "required": ["base_url", "dimensions"], "description": "Use a service that's compatible with the OpenAI API to embed text." 
} ] diff --git a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/config.py b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/config.py index 058b7562ce68..f33f29103797 100644 --- a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/config.py +++ b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/config.py @@ -9,8 +9,8 @@ AzureOpenAIEmbeddingConfigModel, CohereEmbeddingConfigModel, FakeEmbeddingConfigModel, - OpenAIEmbeddingConfigModel, OpenAICompatibleEmbeddingConfigModel, + OpenAIEmbeddingConfigModel, ProcessingConfigModel, ) from airbyte_cdk.utils.spec_schema_transformations import resolve_refs @@ -41,7 +41,11 @@ class ConfigModel(BaseModel): indexing: PineconeIndexingModel embedding: Union[ - OpenAIEmbeddingConfigModel, CohereEmbeddingConfigModel, FakeEmbeddingConfigModel, AzureOpenAIEmbeddingConfigModel, OpenAICompatibleEmbeddingConfigModel + OpenAIEmbeddingConfigModel, + CohereEmbeddingConfigModel, + FakeEmbeddingConfigModel, + AzureOpenAIEmbeddingConfigModel, + OpenAICompatibleEmbeddingConfigModel, ] = Field(..., title="Embedding", description="Embedding configuration", discriminator="mode", group="embedding", type="object") processing: ProcessingConfigModel diff --git a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py index b5a86ec5bc63..1b5a613ad4d7 100644 --- a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py +++ b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py @@ -7,7 +7,7 @@ from airbyte_cdk import AirbyteLogger from airbyte_cdk.destinations import Destination -from airbyte_cdk.destinations.vector_db_based.embedder import create_from_config, Embedder +from airbyte_cdk.destinations.vector_db_based.embedder import Embedder, create_from_config from airbyte_cdk.destinations.vector_db_based.indexer import Indexer from airbyte_cdk.destinations.vector_db_based.writer import Writer from airbyte_cdk.models import AirbyteConnectionStatus, AirbyteMessage, ConfiguredAirbyteCatalog, ConnectorSpecification, Status @@ -17,6 +17,7 @@ BATCH_SIZE = 32 + class DestinationPinecone(Destination): indexer: Indexer embedder: Embedder diff --git a/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json b/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json index 91c9a094a630..31e76e8d43b3 100644 --- a/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json +++ b/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json @@ -131,9 +131,7 @@ "title": "Mode", "default": "openai_compatible", "const": "openai_compatible", - "enum": [ - "openai_compatible" - ], + "enum": ["openai_compatible"], "type": "string" }, "api_key": { @@ -145,34 +143,24 @@ "base_url": { "title": "Base URL", "description": "The base URL for your OpenAI-compatible service", - "examples": [ - "https://your-service-name.com" - ], + "examples": ["https://your-service-name.com"], "type": "string" }, "model_name": { "title": "Model name", "description": "The name of the model to use for embedding", "default": "text-embedding-ada-002", - "examples": [ - "text-embedding-ada-002" - ], + "examples": ["text-embedding-ada-002"], "type": "string" }, "dimensions": { "title": "Embedding dimensions", "description": "The number of dimensions the 
embedding model is generating", - "examples": [ - 1536, - 384 - ], + "examples": [1536, 384], "type": "integer" } }, - "required": [ - "base_url", - "dimensions" - ], + "required": ["base_url", "dimensions"], "description": "Use a service that's compatible with the OpenAI API to embed text." } ] diff --git a/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/config.py b/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/config.py index 18c545e88b21..1f79cffa7566 100644 --- a/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/config.py +++ b/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/config.py @@ -13,8 +13,8 @@ CohereEmbeddingConfigModel, FakeEmbeddingConfigModel, FromFieldEmbeddingConfigModel, - OpenAIEmbeddingConfigModel, OpenAICompatibleEmbeddingConfigModel, + OpenAIEmbeddingConfigModel, ProcessingConfigModel, ) from jsonschema import RefResolver diff --git a/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/destination.py b/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/destination.py index 8c07ebd5114c..faf58bc688cb 100644 --- a/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/destination.py +++ b/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/destination.py @@ -6,10 +6,7 @@ from airbyte_cdk import AirbyteLogger from airbyte_cdk.destinations import Destination -from airbyte_cdk.destinations.vector_db_based.embedder import ( - create_from_config, - Embedder, -) +from airbyte_cdk.destinations.vector_db_based.embedder import Embedder, create_from_config from airbyte_cdk.destinations.vector_db_based.indexer import Indexer from airbyte_cdk.destinations.vector_db_based.writer import Writer from airbyte_cdk.models import AirbyteConnectionStatus, AirbyteMessage, ConfiguredAirbyteCatalog, ConnectorSpecification, Status @@ -19,6 +16,7 @@ BATCH_SIZE = 256 + class DestinationQdrant(Destination): indexer: Indexer embedder: Embedder diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py index fb1a596d9cc8..935e07e18711 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py @@ -10,8 +10,8 @@ CohereEmbeddingConfigModel, FakeEmbeddingConfigModel, FromFieldEmbeddingConfigModel, - OpenAIEmbeddingConfigModel, OpenAICompatibleEmbeddingConfigModel, + OpenAIEmbeddingConfigModel, ProcessingConfigModel, ) from airbyte_cdk.utils.spec_schema_transformations import resolve_refs diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/destination.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/destination.py index 711b3ca34e62..d7859cc41abc 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/destination.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/destination.py @@ -7,10 +7,7 @@ from airbyte_cdk import AirbyteLogger from airbyte_cdk.destinations import Destination -from airbyte_cdk.destinations.vector_db_based.embedder import ( - create_from_config, - Embedder, -) +from airbyte_cdk.destinations.vector_db_based.embedder import Embedder, create_from_config from airbyte_cdk.destinations.vector_db_based.indexer import Indexer from 
airbyte_cdk.destinations.vector_db_based.writer import Writer from airbyte_cdk.models import AirbyteConnectionStatus, AirbyteMessage, ConfiguredAirbyteCatalog, ConnectorSpecification, Status @@ -25,7 +22,11 @@ class DestinationWeaviate(Destination): embedder: Embedder def _init_indexer(self, config: ConfigModel): - self.embedder = create_from_config(config.embedding, config.processing) if config.embedding.mode != "no_embedding" else NoEmbedder(config.embedding) + self.embedder = ( + create_from_config(config.embedding, config.processing) + if config.embedding.mode != "no_embedding" + else NoEmbedder(config.embedding) + ) self.indexer = WeaviateIndexer(config.indexing) def write( diff --git a/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json b/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json index 90ea6a0b9d1f..27ba7bea9e48 100644 --- a/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json +++ b/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json @@ -278,9 +278,7 @@ "title": "Mode", "default": "openai_compatible", "const": "openai_compatible", - "enum": [ - "openai_compatible" - ], + "enum": ["openai_compatible"], "type": "string" }, "api_key": { @@ -292,34 +290,24 @@ "base_url": { "title": "Base URL", "description": "The base URL for your OpenAI-compatible service", - "examples": [ - "https://your-service-name.com" - ], + "examples": ["https://your-service-name.com"], "type": "string" }, "model_name": { "title": "Model name", "description": "The name of the model to use for embedding", "default": "text-embedding-ada-002", - "examples": [ - "text-embedding-ada-002" - ], + "examples": ["text-embedding-ada-002"], "type": "string" }, "dimensions": { "title": "Embedding dimensions", "description": "The number of dimensions the embedding model is generating", - "examples": [ - 1536, - 384 - ], + "examples": [1536, 384], "type": "integer" } }, - "required": [ - "base_url", - "dimensions" - ], + "required": ["base_url", "dimensions"], "description": "Use a service that's compatible with the OpenAI API to embed text." 
} ] From e2bbdd5b9a521472ace8f1f2962068549fb3d88a Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 13 Oct 2023 09:25:27 +0200 Subject: [PATCH 04/22] adjust indexers --- .../destination_chroma/indexer.py | 4 +++- .../connectors/destination-chroma/setup.py | 2 +- .../unit_tests/test_indexer.py | 4 ++-- .../destination_milvus/indexer.py | 12 ++++++---- .../connectors/destination-milvus/setup.py | 2 +- .../unit_tests/indexer_test.py | 4 ++-- .../destination_pinecone/indexer.py | 8 ++++--- .../connectors/destination-pinecone/setup.py | 2 +- .../unit_tests/pinecone_indexer_test.py | 23 +++++++++++++++---- .../destination_qdrant/indexer.py | 4 +++- .../connectors/destination-qdrant/setup.py | 2 +- .../unit_tests/test_indexer.py | 5 ++-- .../destination_weaviate/indexer.py | 4 +++- .../connectors/destination-weaviate/setup.py | 2 +- .../unit_tests/indexer_test.py | 12 +++++----- 15 files changed, 58 insertions(+), 32 deletions(-) diff --git a/airbyte-integrations/connectors/destination-chroma/destination_chroma/indexer.py b/airbyte-integrations/connectors/destination-chroma/destination_chroma/indexer.py index 84d6fb24ce9c..57eb05f84bfc 100644 --- a/airbyte-integrations/connectors/destination-chroma/destination_chroma/indexer.py +++ b/airbyte-integrations/connectors/destination-chroma/destination_chroma/indexer.py @@ -46,9 +46,11 @@ def check(self): finally: del client - def index(self, document_chunks: List[Chunk], delete_ids: List[str]) -> None: + def delete(self, delete_ids, namespace, stream): if len(delete_ids) > 0: self._delete_by_filter(field_name=METADATA_RECORD_ID_FIELD, field_values=delete_ids) + + def index(self, document_chunks, namespace, stream): entities = [] for i in range(len(document_chunks)): chunk = document_chunks[i] diff --git a/airbyte-integrations/connectors/destination-chroma/setup.py b/airbyte-integrations/connectors/destination-chroma/setup.py index 29f3e143486a..513198c9c717 100644 --- a/airbyte-integrations/connectors/destination-chroma/setup.py +++ b/airbyte-integrations/connectors/destination-chroma/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk[vector-db-based]==0.51.30", + "airbyte-cdk[vector-db-based]==0.51.33", "chromadb", ] diff --git a/airbyte-integrations/connectors/destination-chroma/unit_tests/test_indexer.py b/airbyte-integrations/connectors/destination-chroma/unit_tests/test_indexer.py index d35daa7ee080..f1a9bf493d57 100644 --- a/airbyte-integrations/connectors/destination-chroma/unit_tests/test_indexer.py +++ b/airbyte-integrations/connectors/destination-chroma/unit_tests/test_indexer.py @@ -149,11 +149,11 @@ def test_pre_sync_does_not_call_delete(self): self.mock_client.get_collection().delete.assert_not_called() def test_index_calls_insert(self): - self.chroma_indexer.index([Mock(metadata={"key": "value"}, page_content="some content", embedding=[1, 2, 3])], []) + self.chroma_indexer.index([Mock(metadata={"key": "value"}, page_content="some content", embedding=[1, 2, 3])], None, "some_stream") self.mock_client.get_collection().add.assert_called_once() def test_index_calls_delete(self): - self.chroma_indexer.index([], ["some_id"]) + self.chroma_indexer.delete(["some_id"], None, "some_stream") self.mock_client.get_collection().delete.assert_called_with(where={"_ab_record_id": {"$in": ["some_id"]}}) diff --git a/airbyte-integrations/connectors/destination-milvus/destination_milvus/indexer.py b/airbyte-integrations/connectors/destination-milvus/destination_milvus/indexer.py index 
f9ef7262f7e5..c64c58d54fdd 100644 --- a/airbyte-integrations/connectors/destination-milvus/destination_milvus/indexer.py +++ b/airbyte-integrations/connectors/destination-milvus/destination_milvus/indexer.py @@ -107,11 +107,7 @@ def _normalize(self, metadata: dict) -> dict: return result - def index(self, document_chunks: List[Chunk], delete_ids: List[str]) -> None: - if len(delete_ids) > 0: - id_list_expr = ", ".join([f'"{id}"' for id in delete_ids]) - id_expr = f"{METADATA_RECORD_ID_FIELD} in [{id_list_expr}]" - self._delete_for_filter(id_expr) + def index(self, document_chunks, namespace, stream): entities = [] for i in range(len(document_chunks)): chunk = document_chunks[i] @@ -119,3 +115,9 @@ def index(self, document_chunks: List[Chunk], delete_ids: List[str]) -> None: {**self._normalize(chunk.metadata), self.config.vector_field: chunk.embedding, self.config.text_field: chunk.page_content} ) self._collection.insert(entities) + + def delete(self, delete_ids, namespace, stream): + if len(delete_ids) > 0: + id_list_expr = ", ".join([f'"{id}"' for id in delete_ids]) + id_expr = f"{METADATA_RECORD_ID_FIELD} in [{id_list_expr}]" + self._delete_for_filter(id_expr) diff --git a/airbyte-integrations/connectors/destination-milvus/setup.py b/airbyte-integrations/connectors/destination-milvus/setup.py index 0743fb9d5ccd..6df8ecdfd819 100644 --- a/airbyte-integrations/connectors/destination-milvus/setup.py +++ b/airbyte-integrations/connectors/destination-milvus/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, setup -MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.30", "pymilvus==2.3.0"] +MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.33", "pymilvus==2.3.0"] TEST_REQUIREMENTS = ["pytest~=6.2"] diff --git a/airbyte-integrations/connectors/destination-milvus/unit_tests/indexer_test.py b/airbyte-integrations/connectors/destination-milvus/unit_tests/indexer_test.py index f24b50fe32c2..0679a26a960b 100644 --- a/airbyte-integrations/connectors/destination-milvus/unit_tests/indexer_test.py +++ b/airbyte-integrations/connectors/destination-milvus/unit_tests/indexer_test.py @@ -142,7 +142,7 @@ def test_pre_sync_does_not_call_delete(self): def test_index_calls_insert(self): self.milvus_indexer._primary_key = "id" - self.milvus_indexer.index([Mock(metadata={"key": "value", "id": 5}, page_content="some content", embedding=[1, 2, 3])], []) + self.milvus_indexer.index([Mock(metadata={"key": "value", "id": 5}, page_content="some content", embedding=[1, 2, 3])], None, "some_stream") self.milvus_indexer._collection.insert.assert_called_with([{"key": "value", "vector": [1, 2, 3], "text": "some content", "_id": 5}]) @@ -151,7 +151,7 @@ def test_index_calls_delete(self): mock_iterator.next.side_effect = [[{"id": "123"}, {"id": "456"}], [{"id": "789"}], []] self.milvus_indexer._collection.query_iterator.return_value = mock_iterator - self.milvus_indexer.index([], ["some_id"]) + self.milvus_indexer.delete(["some_id"], None, "some_stream") self.milvus_indexer._collection.query_iterator.assert_called_with(expr='_ab_record_id in ["some_id"]') self.milvus_indexer._collection.delete.assert_has_calls([call(expr="id in [123, 456]"), call(expr="id in [789]")], any_order=False) diff --git a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py index 521504a9856f..5a2266971089 100644 --- a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py +++ 
b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py @@ -82,9 +82,7 @@ def _truncate_metadata(self, metadata: dict) -> dict: return result - def index(self, document_chunks, delete_ids): - if len(delete_ids) > 0: - self.delete_vectors(filter={METADATA_RECORD_ID_FIELD: {"$in": delete_ids}}) + def index(self, document_chunks, namespace, stream): pinecone_docs = [] for i in range(len(document_chunks)): chunk = document_chunks[i] @@ -100,6 +98,10 @@ def index(self, document_chunks, delete_ids): # Wait for and retrieve responses (this raises in case of error) [async_result.result() for async_result in async_results] + def delete(self, delete_ids, namespace, stream): + if len(delete_ids) > 0: + self.delete_vectors(filter={METADATA_RECORD_ID_FIELD: {"$in": delete_ids}}) + def check(self) -> Optional[str]: try: description = pinecone.describe_index(self.config.index) diff --git a/airbyte-integrations/connectors/destination-pinecone/setup.py b/airbyte-integrations/connectors/destination-pinecone/setup.py index 57add87ef602..70a00165abca 100644 --- a/airbyte-integrations/connectors/destination-pinecone/setup.py +++ b/airbyte-integrations/connectors/destination-pinecone/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk[vector-db-based]==0.51.30", + "airbyte-cdk[vector-db-based]==0.51.33", "pinecone-client[grpc]", ] diff --git a/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py b/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py index 89a8751cf6a7..96a082d67e07 100644 --- a/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py +++ b/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py @@ -51,7 +51,13 @@ def test_pinecone_index_upsert_and_delete(mock_describe_index): Mock(page_content="test", metadata={"_ab_stream": "abc"}, embedding=[1, 2, 3]), Mock(page_content="test2", metadata={"_ab_stream": "abc"}, embedding=[4, 5, 6]), ], + None, + "some_stream" + ) + indexer.delete( ["delete_id1", "delete_id2"], + None, + "some_stream" ) indexer.pinecone_index.delete.assert_called_with(filter={"_ab_record_id": {"$in": ["delete_id1", "delete_id2"]}}) indexer.pinecone_index.upsert.assert_called_with( @@ -77,7 +83,13 @@ def test_pinecone_index_upsert_and_delete_starter(mock_describe_index): Mock(page_content="test", metadata={"_ab_stream": "abc"}, embedding=[1, 2, 3]), Mock(page_content="test2", metadata={"_ab_stream": "abc"}, embedding=[4, 5, 6]), ], + None, + "some_stream" + ) + indexer.delete( ["delete_id1", "delete_id2"], + None, + "some_stream" ) indexer.pinecone_index.query.assert_called_with( vector=[0, 0, 0], filter={"_ab_record_id": {"$in": ["delete_id1", "delete_id2"]}}, top_k=10_000 ) @@ -100,9 +112,10 @@ def test_pinecone_index_delete_1k_limit(mock_describe_index): MagicMock(matches=[MagicMock(id=f"doc_id_{str(i)}") for i in range(1300)]), MagicMock(matches=[]), ] - indexer.index( - [], + indexer.delete( ["delete_id1"], + None, + "some_stream" ) indexer.pinecone_index.delete.assert_has_calls( [call(ids=[f"doc_id_{str(i)}" for i in range(1000)]), call(ids=[f"doc_id_{str(i+1000)}" for i in range(300)])] ) @@ -123,7 +136,8 @@ def test_pinecone_index_upsert_batching(): indexer = create_pinecone_indexer() indexer.index( [Mock(page_content=f"test {i}", metadata={"_ab_stream": "abc"}, embedding=[i, i, i]) for i in range(50)], - [], + None, + "some_stream", ) assert
indexer.pinecone_index.upsert.call_count == 2 for i in range(40): @@ -232,7 +246,8 @@ def test_metadata_normalization(): }, ), ], - [], + None, + "some_stream", ) indexer.pinecone_index.upsert.assert_called_with( vectors=((ANY, [1, 2, 3], {"_ab_stream": "abc", "text": "test", "small": "a", "id": 1}),), diff --git a/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/indexer.py b/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/indexer.py index 74a8c60dceec..ab526f493d8b 100644 --- a/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/indexer.py +++ b/airbyte-integrations/connectors/destination-qdrant/destination_qdrant/indexer.py @@ -85,7 +85,7 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: collection_name=self.config.collection, field_name=field, field_schema=PayloadSchemaType.KEYWORD ) - def index(self, document_chunks: List[Chunk], delete_ids: List[str]) -> None: + def delete(self, delete_ids, namespace, stream): if len(delete_ids) > 0: self._delete_for_filter( models.FilterSelector( @@ -96,6 +96,8 @@ def index(self, document_chunks: List[Chunk], delete_ids: List[str]) -> None: ) ) ) + + def index(self, document_chunks, namespace, stream): entities = [] for i in range(len(document_chunks)): chunk = document_chunks[i] diff --git a/airbyte-integrations/connectors/destination-qdrant/setup.py b/airbyte-integrations/connectors/destination-qdrant/setup.py index fe20ebc3cc65..78112aae02b9 100644 --- a/airbyte-integrations/connectors/destination-qdrant/setup.py +++ b/airbyte-integrations/connectors/destination-qdrant/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, setup -MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.30", "qdrant-client", "fastembed"] +MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.33", "qdrant-client", "fastembed"] TEST_REQUIREMENTS = ["pytest~=6.2"] diff --git a/airbyte-integrations/connectors/destination-qdrant/unit_tests/test_indexer.py b/airbyte-integrations/connectors/destination-qdrant/unit_tests/test_indexer.py index 047ff340866f..eef6619302aa 100644 --- a/airbyte-integrations/connectors/destination-qdrant/unit_tests/test_indexer.py +++ b/airbyte-integrations/connectors/destination-qdrant/unit_tests/test_indexer.py @@ -150,13 +150,14 @@ def test_index_calls_insert(self): Mock(metadata={"key": "value1"}, page_content="some content", embedding=[1.0, 2.0, 3.0]), Mock(metadata={"key": "value2"}, page_content="some other content", embedding=[4.0, 5.0, 6.0]), ], - [], + None, + "some_stream", ) self.qdrant_indexer._client.upload_records.assert_called_once() def test_index_calls_delete(self): - self.qdrant_indexer.index([], ["some_id", "another_id"]) + self.qdrant_indexer.delete(["some_id", "another_id"], None, "some_stream") self.qdrant_indexer._client.delete.assert_called_with( collection_name=self.mock_config.collection, diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py index 6d44278f028a..9593470cfd5c 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py @@ -108,7 +108,7 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: prop.get("name") == METADATA_RECORD_ID_FIELD for prop in schema.get("properties", {}) ) - def index(self, document_chunks: List[Chunk], delete_ids: List[str]) -> 
None: + def delete(self, delete_ids, namespace, stream): if len(delete_ids) > 0: # Delete ids in all classes that have the record id metadata for class_name in self.has_record_id_metadata.keys(): @@ -117,6 +117,8 @@ def index(self, document_chunks: List[Chunk], delete_ids: List[str]) -> None: class_name=class_name, where={"path": [METADATA_RECORD_ID_FIELD], "operator": "ContainsAny", "valueStringArray": delete_ids}, ) + + def index(self, document_chunks, namespace, stream): if len(document_chunks) == 0: return diff --git a/airbyte-integrations/connectors/destination-weaviate/setup.py b/airbyte-integrations/connectors/destination-weaviate/setup.py index cb5c97dac60c..f67c52daa740 100644 --- a/airbyte-integrations/connectors/destination-weaviate/setup.py +++ b/airbyte-integrations/connectors/destination-weaviate/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, setup -MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.30", "weaviate-client==3.23.2"] +MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.33", "weaviate-client==3.23.2"] TEST_REQUIREMENTS = ["pytest~=6.2", "docker", "pytest-docker"] diff --git a/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py b/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py index 048ebf81ce4a..f854b0d7e42d 100644 --- a/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py +++ b/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py @@ -96,7 +96,7 @@ def test_index_deletes_by_record_id(self): mock_client = Mock() self.indexer.client = mock_client self.indexer.has_record_id_metadata = {"Test": True} - self.indexer.index([], ["some_id", "some_other_id"]) + self.indexer.delete(["some_id", "some_other_id"], None, "some_stream") mock_client.batch.delete_objects.assert_called_with( class_name="Test", where={"path": ["_ab_record_id"], "operator": "ContainsAny", "valueStringArray": ["some_id", "some_other_id"]}, @@ -107,7 +107,7 @@ def test_index_not_delete_no_metadata_field(self, MockClient): mock_client = Mock() MockClient.return_value = mock_client self.indexer.has_record_id_metadata = {"Test": False} - self.indexer.index([], ["some_id"]) + self.indexer.delete(["some_id"], None, "some_stream") mock_client.batch.delete_objects.assert_not_called() def test_index_flushes_batch(self): @@ -126,7 +126,7 @@ def test_index_flushes_batch(self): metadata={"someField": "some_value2"}, record=AirbyteRecordMessage(stream="test", data={"someField": "some_value"}, emitted_at=0), ) - self.indexer.index([mock_chunk1, mock_chunk2], []) + self.indexer.index([mock_chunk1, mock_chunk2], None, "some_stream") mock_client.batch.create_objects.assert_called() chunk1_call = call({"someField": "some_value", "text": "some_content"}, "Test", ANY, vector=[1, 2, 3]) chunk2_call = call({"someField": "some_value2", "text": "some_other_content"}, "Test", ANY, vector=[4, 5, 6]) @@ -155,7 +155,7 @@ def test_index_splits_batch(self): metadata={"someField": "some_value3"}, record=AirbyteRecordMessage(stream="test", data={"someField": "some_value3"}, emitted_at=0), ) - self.indexer.index([mock_chunk1, mock_chunk2, mock_chunk3], []) + self.indexer.index([mock_chunk1, mock_chunk2, mock_chunk3], None, "some_stream") assert mock_client.batch.create_objects.call_count == 2 @patch("destination_weaviate.indexer.uuid.uuid4") @@ -178,7 +178,7 @@ def test_index_flushes_batch_and_retries(self, MockTime, MockUUID): record=AirbyteRecordMessage(stream="test", data={"someField": 
"some_value"}, emitted_at=0), ) with self.assertRaises(WeaviatePartialBatchError): - self.indexer.index([mock_chunk1, mock_chunk2], []) + self.indexer.index([mock_chunk1, mock_chunk2], None, "some_stream") chunk1_call = call({"someField": "some_value", "text": "some_content"}, "Test", "some_id", vector=[1, 2, 3]) chunk2_call = call({"someField": "some_value2", "text": "some_other_content"}, "Test", "some_id2", vector=[4, 5, 6]) self.assertEqual(mock_client.batch.create_objects.call_count, 3) # 1 initial try + 2 retries @@ -196,7 +196,7 @@ def test_index_flushes_batch_and_normalizes(self): metadata={"someField": "some_value", "complex": {"a": [1, 2, 3]}, "UPPERCASE_NAME": "abc"}, record=AirbyteRecordMessage(stream="test", data={"someField": "some_value"}, emitted_at=0), ) - self.indexer.index([mock_chunk], []) + self.indexer.index([mock_chunk], None, "some_stream") mock_client.batch.add_data_object.assert_called_with( {"someField": "some_value", "complex": '{"a": [1, 2, 3]}', "uPPERCASE_NAME": "abc", "text": "some_content"}, "Test", From da171995225060e87972493d6db3fa2c7e8c5d64 Mon Sep 17 00:00:00 2001 From: alafanechere Date: Fri, 13 Oct 2023 09:39:52 +0200 Subject: [PATCH 05/22] use GITHUB_TOKEN instead of GH_PAT_MAINTENANCE_OCTAVIA to avoid GHA rate limit --- .github/workflows/label-github-issues-by-path.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-github-issues-by-path.yml b/.github/workflows/label-github-issues-by-path.yml index 1b18f9ec5512..5025edec8f4d 100644 --- a/.github/workflows/label-github-issues-by-path.yml +++ b/.github/workflows/label-github-issues-by-path.yml @@ -13,5 +13,5 @@ jobs: - name: "Label PR based on changed files" uses: actions/labeler@v3 with: - repo-token: "${{ secrets.GH_PAT_MAINTENANCE_OCTAVIA }}" + repo-token: "${{ secrets.GITHUB_TOKEN }}" sync-labels: true From 024561e3aee01dd31ac18b3fc787721e1028b645 Mon Sep 17 00:00:00 2001 From: alafanechere Date: Fri, 13 Oct 2023 09:47:20 +0200 Subject: [PATCH 06/22] Revert "use GITHUB_TOKEN instead of GH_PAT_MAINTENANCE_OCTAVIA to avoid GHA rate limit" This reverts commit da171995225060e87972493d6db3fa2c7e8c5d64. 
--- .github/workflows/label-github-issues-by-path.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label-github-issues-by-path.yml b/.github/workflows/label-github-issues-by-path.yml index 5025edec8f4d..1b18f9ec5512 100644 --- a/.github/workflows/label-github-issues-by-path.yml +++ b/.github/workflows/label-github-issues-by-path.yml @@ -13,5 +13,5 @@ jobs: - name: "Label PR based on changed files" uses: actions/labeler@v3 with: - repo-token: "${{ secrets.GITHUB_TOKEN }}" + repo-token: "${{ secrets.GH_PAT_MAINTENANCE_OCTAVIA }}" sync-labels: true From 5f68d7c89a4c1d5c92fe7585a34836409c59a25e Mon Sep 17 00:00:00 2001 From: alafanechere Date: Fri, 13 Oct 2023 09:49:37 +0200 Subject: [PATCH 07/22] remove repo-token on labeler action to fallback to default --- .github/workflows/label-github-issues-by-path.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/label-github-issues-by-path.yml b/.github/workflows/label-github-issues-by-path.yml index 1b18f9ec5512..f106fa5ee45a 100644 --- a/.github/workflows/label-github-issues-by-path.yml +++ b/.github/workflows/label-github-issues-by-path.yml @@ -13,5 +13,4 @@ jobs: - name: "Label PR based on changed files" uses: actions/labeler@v3 with: - repo-token: "${{ secrets.GH_PAT_MAINTENANCE_OCTAVIA }}" sync-labels: true From 844b89de890130a2729c5c0a6d2304009292978d Mon Sep 17 00:00:00 2001 From: alafanechere Date: Fri, 13 Oct 2023 09:51:24 +0200 Subject: [PATCH 08/22] Revert "remove repo-token on labeler action to fallback to default" This reverts commit 5f68d7c89a4c1d5c92fe7585a34836409c59a25e. --- .github/workflows/label-github-issues-by-path.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/label-github-issues-by-path.yml b/.github/workflows/label-github-issues-by-path.yml index f106fa5ee45a..1b18f9ec5512 100644 --- a/.github/workflows/label-github-issues-by-path.yml +++ b/.github/workflows/label-github-issues-by-path.yml @@ -13,4 +13,5 @@ jobs: - name: "Label PR based on changed files" uses: actions/labeler@v3 with: + repo-token: "${{ secrets.GH_PAT_MAINTENANCE_OCTAVIA }}" sync-labels: true From e3569c114dda434ece14b4833b1277ff94451e9a Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 13 Oct 2023 09:53:33 +0200 Subject: [PATCH 09/22] bump cdk version --- airbyte-integrations/connectors/destination-chroma/setup.py | 2 +- airbyte-integrations/connectors/destination-milvus/setup.py | 2 +- airbyte-integrations/connectors/destination-pinecone/setup.py | 2 +- airbyte-integrations/connectors/destination-qdrant/setup.py | 2 +- airbyte-integrations/connectors/destination-weaviate/setup.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/airbyte-integrations/connectors/destination-chroma/setup.py b/airbyte-integrations/connectors/destination-chroma/setup.py index 513198c9c717..84ca194fb841 100644 --- a/airbyte-integrations/connectors/destination-chroma/setup.py +++ b/airbyte-integrations/connectors/destination-chroma/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk[vector-db-based]==0.51.33", + "airbyte-cdk[vector-db-based]==0.51.34", "chromadb", ] diff --git a/airbyte-integrations/connectors/destination-milvus/setup.py b/airbyte-integrations/connectors/destination-milvus/setup.py index 6df8ecdfd819..2e6aff6516d7 100644 --- a/airbyte-integrations/connectors/destination-milvus/setup.py +++ b/airbyte-integrations/connectors/destination-milvus/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, 
setup -MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.33", "pymilvus==2.3.0"] +MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.34", "pymilvus==2.3.0"] TEST_REQUIREMENTS = ["pytest~=6.2"] diff --git a/airbyte-integrations/connectors/destination-pinecone/setup.py b/airbyte-integrations/connectors/destination-pinecone/setup.py index 70a00165abca..8c2727e0c6dc 100644 --- a/airbyte-integrations/connectors/destination-pinecone/setup.py +++ b/airbyte-integrations/connectors/destination-pinecone/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk[vector-db-based]==0.51.33", + "airbyte-cdk[vector-db-based]==0.51.34", "pinecone-client[grpc]", ] diff --git a/airbyte-integrations/connectors/destination-qdrant/setup.py b/airbyte-integrations/connectors/destination-qdrant/setup.py index 78112aae02b9..7e83c9ba7ad7 100644 --- a/airbyte-integrations/connectors/destination-qdrant/setup.py +++ b/airbyte-integrations/connectors/destination-qdrant/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, setup -MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.33", "qdrant-client", "fastembed"] +MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.34", "qdrant-client", "fastembed"] TEST_REQUIREMENTS = ["pytest~=6.2"] diff --git a/airbyte-integrations/connectors/destination-weaviate/setup.py b/airbyte-integrations/connectors/destination-weaviate/setup.py index f67c52daa740..cd9ac4d101d1 100644 --- a/airbyte-integrations/connectors/destination-weaviate/setup.py +++ b/airbyte-integrations/connectors/destination-weaviate/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, setup -MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.33", "weaviate-client==3.23.2"] +MAIN_REQUIREMENTS = ["airbyte-cdk[vector-db-based]==0.51.34", "weaviate-client==3.23.2"] TEST_REQUIREMENTS = ["pytest~=6.2", "docker", "pytest-docker"] From f1d2eebf27bd8e8ffbd24510a8121e718ec98e99 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 13 Oct 2023 10:03:16 +0200 Subject: [PATCH 10/22] add support for namespaces --- .../destination_pinecone/indexer.py | 20 +++++------ .../unit_tests/pinecone_indexer_test.py | 34 +++++++++++-------- docs/integrations/destinations/pinecone.md | 2 +- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py index 5a2266971089..23779de870ef 100644 --- a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py +++ b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py @@ -38,30 +38,30 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog): self._pod_type = index_description.pod_type for stream in catalog.streams: if stream.destination_sync_mode == DestinationSyncMode.overwrite: - self.delete_vectors(filter={METADATA_STREAM_FIELD: stream.stream.name}) + self.delete_vectors(filter={METADATA_STREAM_FIELD: stream.stream.name}, namespace=stream.stream.namespace) def post_sync(self): return [] - def delete_vectors(self, filter): + def delete_vectors(self, filter, namespace = None): if self._pod_type == "starter": # Starter pod types have a maximum of 100000 rows top_k = 10000 - self.delete_by_metadata(filter, top_k) + self.delete_by_metadata(filter, top_k, namespace) else: - self.pinecone_index.delete(filter=filter) + self.pinecone_index.delete(filter=filter, namespace=namespace) - 
def delete_by_metadata(self, filter, top_k): + def delete_by_metadata(self, filter, top_k, namespace = None): zero_vector = [0.0] * self.embedding_dimensions - query_result = self.pinecone_index.query(vector=zero_vector, filter=filter, top_k=top_k) + query_result = self.pinecone_index.query(vector=zero_vector, filter=filter, top_k=top_k, namespace=namespace) while len(query_result.matches) > 0: vector_ids = [doc.id for doc in query_result.matches] if len(vector_ids) > 0: # split into chunks of 1000 ids to avoid id limit batches = create_chunks(vector_ids, batch_size=MAX_IDS_PER_DELETE) for batch in batches: - self.pinecone_index.delete(ids=list(batch)) - query_result = self.pinecone_index.query(vector=zero_vector, filter=filter, top_k=top_k) + self.pinecone_index.delete(ids=list(batch), namespace=namespace) + query_result = self.pinecone_index.query(vector=zero_vector, filter=filter, top_k=top_k, namespace=namespace) def _truncate_metadata(self, metadata: dict) -> dict: """ @@ -92,7 +92,7 @@ def index(self, document_chunks, namespace, stream): serial_batches = create_chunks(pinecone_docs, batch_size=PINECONE_BATCH_SIZE * PARALLELISM_LIMIT) for batch in serial_batches: async_results = [ - self.pinecone_index.upsert(vectors=ids_vectors_chunk, async_req=True, show_progress=False) + self.pinecone_index.upsert(vectors=ids_vectors_chunk, async_req=True, show_progress=False, namespace=namespace) for ids_vectors_chunk in create_chunks(batch, batch_size=PINECONE_BATCH_SIZE) ] # Wait for and retrieve responses (this raises in case of error) @@ -100,7 +100,7 @@ def index(self, document_chunks, namespace, stream): def delete(self, delete_ids, namespace, stream): if len(delete_ids) > 0: - self.delete_vectors(filter={METADATA_RECORD_ID_FIELD: {"$in": delete_ids}}) + self.delete_vectors(filter={METADATA_RECORD_ID_FIELD: {"$in": delete_ids}}, namespace=namespace) def check(self) -> Optional[str]: try: diff --git a/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py b/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py index 96a082d67e07..06c8e27f0260 100644 --- a/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py +++ b/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py @@ -51,15 +51,15 @@ def test_pinecone_index_upsert_and_delete(mock_describe_index): Mock(page_content="test", metadata={"_ab_stream": "abc"}, embedding=[1, 2, 3]), Mock(page_content="test2", metadata={"_ab_stream": "abc"}, embedding=[4, 5, 6]), ], - None, + "ns1", "some_stream" ) indexer.index( ["delete_id1", "delete_id2"], - None, + "ns1", "some_stram" ) - indexer.pinecone_index.delete.assert_called_with(filter={"_ab_record_id": {"$in": ["delete_id1", "delete_id2"]}}) + indexer.pinecone_index.delete.assert_called_with(filter={"_ab_record_id": {"$in": ["delete_id1", "delete_id2"]}}, namespace="ns1") indexer.pinecone_index.upsert.assert_called_with( vectors=( (ANY, [1, 2, 3], {"_ab_stream": "abc", "text": "test"}), @@ -67,6 +67,7 @@ def test_pinecone_index_upsert_and_delete(mock_describe_index): ), async_req=True, show_progress=False, + namespace="ns1", ) @@ -83,18 +84,18 @@ def test_pinecone_index_upsert_and_delete_starter(mock_describe_index): Mock(page_content="test", metadata={"_ab_stream": "abc"}, embedding=[1, 2, 3]), Mock(page_content="test2", metadata={"_ab_stream": "abc"}, embedding=[4, 5, 6]), ], - None, + "ns1", "some_stream" ) indexer.index( ["delete_id1", "delete_id2"], - None, + 
"ns1", "some_stram" ) indexer.pinecone_index.query.assert_called_with( - vector=[0, 0, 0], filter={"_ab_record_id": {"$in": ["delete_id1", "delete_id2"]}}, top_k=10_000 + vector=[0, 0, 0], filter={"_ab_record_id": {"$in": ["delete_id1", "delete_id2"]}}, top_k=10_000, namespace="ns1" ) - indexer.pinecone_index.delete.assert_has_calls([call(ids=["doc_id1", "doc_id2"]), call(ids=["doc_id3"])]) + indexer.pinecone_index.delete.assert_has_calls([call(ids=["doc_id1", "doc_id2"]), call(ids=["doc_id3"])], namespace="ns1") indexer.pinecone_index.upsert.assert_called_with( vectors=( (ANY, [1, 2, 3], {"_ab_stream": "abc", "text": "test"}), @@ -102,6 +103,7 @@ def test_pinecone_index_upsert_and_delete_starter(mock_describe_index): ), async_req=True, show_progress=False, + namespace="ns1", ) @@ -114,11 +116,11 @@ def test_pinecone_index_delete_1k_limit(mock_describe_index): ] indexer.delete( ["delete_id1"], - None, + "ns1", "some_stream" ) indexer.pinecone_index.delete.assert_has_calls( - [call(ids=[f"doc_id_{str(i)}" for i in range(1000)]), call(ids=[f"doc_id_{str(i+1000)}" for i in range(300)])] + [call(ids=[f"doc_id_{str(i)}" for i in range(1000)], namespace="ns1"), call(ids=[f"doc_id_{str(i+1000)}" for i in range(300)], namespace="ns1")] ) @@ -126,7 +128,8 @@ def test_pinecone_index_empty_batch(): indexer = create_pinecone_indexer() indexer.index( [], - [], + "ns1", + "some_stream" ) indexer.pinecone_index.delete.assert_not_called() indexer.pinecone_index.upsert.assert_not_called() @@ -136,7 +139,7 @@ def test_pinecone_index_upsert_batching(): indexer = create_pinecone_indexer() indexer.index( [Mock(page_content=f"test {i}", metadata={"_ab_stream": "abc"}, embedding=[i, i, i]) for i in range(50)], - None, + "ns1", "some_stream", ) assert indexer.pinecone_index.upsert.call_count == 2 @@ -165,6 +168,7 @@ def generate_catalog(): "supported_sync_modes": ["full_refresh", "incremental"], "source_defined_cursor": False, "default_cursor_field": ["column_name"], + "namespace": "ns1" }, "primary_key": [["id"]], "sync_mode": "incremental", @@ -177,6 +181,7 @@ def generate_catalog(): "supported_sync_modes": ["full_refresh", "incremental"], "source_defined_cursor": False, "default_cursor_field": ["column_name"], + "namespace": "ns2" }, "primary_key": [["id"]], "sync_mode": "full_refresh", @@ -190,7 +195,7 @@ def generate_catalog(): def test_pinecone_pre_sync(mock_describe_index): indexer = create_pinecone_indexer() indexer.pre_sync(generate_catalog()) - indexer.pinecone_index.delete.assert_called_with(filter={"_ab_stream": "example_stream2"}) + indexer.pinecone_index.delete.assert_called_with(filter={"_ab_stream": "example_stream2"}, namespace="ns2") def test_pinecone_pre_sync_starter(mock_describe_index): @@ -201,8 +206,8 @@ def test_pinecone_pre_sync_starter(mock_describe_index): MagicMock(matches=[]), ] indexer.pre_sync(generate_catalog()) - indexer.pinecone_index.query.assert_called_with(vector=[0, 0, 0], filter={"_ab_stream": "example_stream2"}, top_k=10_000) - indexer.pinecone_index.delete.assert_called_with(ids=["doc_id1", "doc_id2"]) + indexer.pinecone_index.query.assert_called_with(vector=[0, 0, 0], filter={"_ab_stream": "example_stream2"}, top_k=10_000, namespace="ns2") + indexer.pinecone_index.delete.assert_called_with(ids=["doc_id1", "doc_id2"], namespace="ns2") @pytest.mark.parametrize( @@ -253,4 +258,5 @@ def test_metadata_normalization(): vectors=((ANY, [1, 2, 3], {"_ab_stream": "abc", "text": "test", "small": "a", "id": 1}),), async_req=True, show_progress=False, + namespace=None, ) diff --git 
a/docs/integrations/destinations/pinecone.md b/docs/integrations/destinations/pinecone.md index 54fb64f64ba9..483749a28e89 100644 --- a/docs/integrations/destinations/pinecone.md +++ b/docs/integrations/destinations/pinecone.md @@ -30,7 +30,7 @@ You'll need the following information to configure the destination: | Full Refresh Sync | Yes | | | Incremental - Append Sync | Yes | | | Incremental - Append + Deduped | Yes | Deleting records via CDC is not supported (see issue [#29827](https://github.com/airbytehq/airbyte/issues/29827)) | -| Namespaces | No | | +| Namespaces | Yes | | ## Data type mapping From ec1bc699c3ee6b9e1b4979e3d5f57b963b321306 Mon Sep 17 00:00:00 2001 From: alafanechere Date: Fri, 13 Oct 2023 10:05:02 +0200 Subject: [PATCH 11/22] Trigger CI From ef86321b65b3bb85695694aa76677cb9475d64e3 Mon Sep 17 00:00:00 2001 From: flash1293 Date: Fri, 13 Oct 2023 08:17:20 +0000 Subject: [PATCH 12/22] Automated Commit - Formatting Changes --- .../unit_tests/indexer_test.py | 4 +- .../destination_pinecone/indexer.py | 4 +- .../unit_tests/pinecone_indexer_test.py | 41 +++++++------------ 3 files changed, 20 insertions(+), 29 deletions(-) diff --git a/airbyte-integrations/connectors/destination-milvus/unit_tests/indexer_test.py b/airbyte-integrations/connectors/destination-milvus/unit_tests/indexer_test.py index 0679a26a960b..009833703374 100644 --- a/airbyte-integrations/connectors/destination-milvus/unit_tests/indexer_test.py +++ b/airbyte-integrations/connectors/destination-milvus/unit_tests/indexer_test.py @@ -142,7 +142,9 @@ def test_pre_sync_does_not_call_delete(self): def test_index_calls_insert(self): self.milvus_indexer._primary_key = "id" - self.milvus_indexer.index([Mock(metadata={"key": "value", "id": 5}, page_content="some content", embedding=[1, 2, 3])], None, "some_stream") + self.milvus_indexer.index( + [Mock(metadata={"key": "value", "id": 5}, page_content="some content", embedding=[1, 2, 3])], None, "some_stream" + ) self.milvus_indexer._collection.insert.assert_called_with([{"key": "value", "vector": [1, 2, 3], "text": "some content", "_id": 5}]) diff --git a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py index 23779de870ef..94cb466df384 100644 --- a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py +++ b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py @@ -43,7 +43,7 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog): def post_sync(self): return [] - def delete_vectors(self, filter, namespace = None): + def delete_vectors(self, filter, namespace=None): if self._pod_type == "starter": # Starter pod types have a maximum of 100000 rows top_k = 10000 @@ -51,7 +51,7 @@ def delete_vectors(self, filter, namespace = None): else: self.pinecone_index.delete(filter=filter, namespace=namespace) - def delete_by_metadata(self, filter, top_k, namespace = None): + def delete_by_metadata(self, filter, top_k, namespace=None): zero_vector = [0.0] * self.embedding_dimensions query_result = self.pinecone_index.query(vector=zero_vector, filter=filter, top_k=top_k, namespace=namespace) while len(query_result.matches) > 0: diff --git a/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py b/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py index 06c8e27f0260..79def35a306b 100644 --- 
a/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py +++ b/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py @@ -52,13 +52,9 @@ def test_pinecone_index_upsert_and_delete(mock_describe_index): Mock(page_content="test2", metadata={"_ab_stream": "abc"}, embedding=[4, 5, 6]), ], "ns1", - "some_stream" - ) - indexer.index( - ["delete_id1", "delete_id2"], - "ns1", - "some_stram" + "some_stream", ) + indexer.index(["delete_id1", "delete_id2"], "ns1", "some_stram") indexer.pinecone_index.delete.assert_called_with(filter={"_ab_record_id": {"$in": ["delete_id1", "delete_id2"]}}, namespace="ns1") indexer.pinecone_index.upsert.assert_called_with( vectors=( @@ -85,13 +81,9 @@ def test_pinecone_index_upsert_and_delete_starter(mock_describe_index): Mock(page_content="test2", metadata={"_ab_stream": "abc"}, embedding=[4, 5, 6]), ], "ns1", - "some_stream" - ) - indexer.index( - ["delete_id1", "delete_id2"], - "ns1", - "some_stram" + "some_stream", ) + indexer.index(["delete_id1", "delete_id2"], "ns1", "some_stram") indexer.pinecone_index.query.assert_called_with( vector=[0, 0, 0], filter={"_ab_record_id": {"$in": ["delete_id1", "delete_id2"]}}, top_k=10_000, namespace="ns1" ) @@ -114,23 +106,18 @@ def test_pinecone_index_delete_1k_limit(mock_describe_index): MagicMock(matches=[MagicMock(id=f"doc_id_{str(i)}") for i in range(1300)]), MagicMock(matches=[]), ] - indexer.delete( - ["delete_id1"], - "ns1", - "some_stream" - ) + indexer.delete(["delete_id1"], "ns1", "some_stream") indexer.pinecone_index.delete.assert_has_calls( - [call(ids=[f"doc_id_{str(i)}" for i in range(1000)], namespace="ns1"), call(ids=[f"doc_id_{str(i+1000)}" for i in range(300)], namespace="ns1")] + [ + call(ids=[f"doc_id_{str(i)}" for i in range(1000)], namespace="ns1"), + call(ids=[f"doc_id_{str(i+1000)}" for i in range(300)], namespace="ns1"), + ] ) def test_pinecone_index_empty_batch(): indexer = create_pinecone_indexer() - indexer.index( - [], - "ns1", - "some_stream" - ) + indexer.index([], "ns1", "some_stream") indexer.pinecone_index.delete.assert_not_called() indexer.pinecone_index.upsert.assert_not_called() @@ -168,7 +155,7 @@ def generate_catalog(): "supported_sync_modes": ["full_refresh", "incremental"], "source_defined_cursor": False, "default_cursor_field": ["column_name"], - "namespace": "ns1" + "namespace": "ns1", }, "primary_key": [["id"]], "sync_mode": "incremental", @@ -181,7 +168,7 @@ def generate_catalog(): "supported_sync_modes": ["full_refresh", "incremental"], "source_defined_cursor": False, "default_cursor_field": ["column_name"], - "namespace": "ns2" + "namespace": "ns2", }, "primary_key": [["id"]], "sync_mode": "full_refresh", @@ -206,7 +193,9 @@ def test_pinecone_pre_sync_starter(mock_describe_index): MagicMock(matches=[]), ] indexer.pre_sync(generate_catalog()) - indexer.pinecone_index.query.assert_called_with(vector=[0, 0, 0], filter={"_ab_stream": "example_stream2"}, top_k=10_000, namespace="ns2") + indexer.pinecone_index.query.assert_called_with( + vector=[0, 0, 0], filter={"_ab_stream": "example_stream2"}, top_k=10_000, namespace="ns2" + ) indexer.pinecone_index.delete.assert_called_with(ids=["doc_id1", "doc_id2"], namespace="ns2") From 3693b27978408679777eb49549db7b5c1f4fcd8b Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 13 Oct 2023 10:17:25 +0200 Subject: [PATCH 13/22] fix tests --- .../destination_pinecone/destination.py | 1 + .../unit_tests/destination_test.py | 20 +++++++++---------- 
.../unit_tests/pinecone_indexer_test.py | 7 ++++--- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py index 1b5a613ad4d7..44bb48a7becd 100644 --- a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py +++ b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py @@ -32,6 +32,7 @@ def write( config_model = ConfigModel.parse_obj(config) self._init_indexer(config_model) writer = Writer(config_model.processing, self.indexer, self.embedder, batch_size=BATCH_SIZE) + print(writer) yield from writer.write(configured_catalog, input_messages) def check(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> AirbyteConnectionStatus: diff --git a/airbyte-integrations/connectors/destination-pinecone/unit_tests/destination_test.py b/airbyte-integrations/connectors/destination-pinecone/unit_tests/destination_test.py index 2fb198c49263..4509b970e1c9 100644 --- a/airbyte-integrations/connectors/destination-pinecone/unit_tests/destination_test.py +++ b/airbyte-integrations/connectors/destination-pinecone/unit_tests/destination_test.py @@ -8,7 +8,7 @@ from airbyte_cdk import AirbyteLogger from airbyte_cdk.models import ConnectorSpecification, Status from destination_pinecone.config import ConfigModel -from destination_pinecone.destination import DestinationPinecone, embedder_map +from destination_pinecone.destination import DestinationPinecone class TestDestinationPinecone(unittest.TestCase): @@ -26,11 +26,11 @@ def setUp(self): self.logger = AirbyteLogger() @patch("destination_pinecone.destination.PineconeIndexer") - @patch.dict(embedder_map, openai=MagicMock()) - def test_check(self, MockedPineconeIndexer): + @patch("destination_pinecone.destination.create_from_config") + def test_check(self, MockedEmbedder, MockedPineconeIndexer): mock_embedder = Mock() mock_indexer = Mock() - embedder_map["openai"].return_value = mock_embedder + MockedEmbedder.return_value = mock_embedder MockedPineconeIndexer.return_value = mock_indexer mock_embedder.check.return_value = None @@ -44,11 +44,11 @@ def test_check(self, MockedPineconeIndexer): mock_indexer.check.assert_called_once() @patch("destination_pinecone.destination.PineconeIndexer") - @patch.dict(embedder_map, openai=MagicMock()) - def test_check_with_errors(self, MockedPineconeIndexer): + @patch("destination_pinecone.destination.create_from_config") + def test_check_with_errors(self, MockedEmbedder, MockedPineconeIndexer): mock_embedder = Mock() mock_indexer = Mock() - embedder_map["openai"].return_value = mock_embedder + MockedEmbedder.return_value = mock_embedder MockedPineconeIndexer.return_value = mock_indexer embedder_error_message = "Embedder Error" @@ -68,13 +68,13 @@ def test_check_with_errors(self, MockedPineconeIndexer): @patch("destination_pinecone.destination.Writer") @patch("destination_pinecone.destination.PineconeIndexer") - @patch.dict(embedder_map, openai=MagicMock()) - def test_write(self, MockedPineconeIndexer, MockedWriter): + @patch("destination_pinecone.destination.create_from_config") + def test_write(self, MockedEmbedder, MockedPineconeIndexer, MockedWriter): mock_embedder = Mock() mock_indexer = Mock() + MockedEmbedder.return_value = mock_embedder mock_writer = Mock() - embedder_map["openai"].return_value = mock_embedder MockedPineconeIndexer.return_value = mock_indexer 
MockedWriter.return_value = mock_writer diff --git a/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py b/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py index 96a082d67e07..257c36ca5bf1 100644 --- a/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py +++ b/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py @@ -54,7 +54,7 @@ def test_pinecone_index_upsert_and_delete(mock_describe_index): None, "some_stream" ) - indexer.index( + indexer.delete( ["delete_id1", "delete_id2"], None, "some_stram" @@ -86,7 +86,7 @@ def test_pinecone_index_upsert_and_delete_starter(mock_describe_index): None, "some_stream" ) - indexer.index( + indexer.delete( ["delete_id1", "delete_id2"], None, "some_stram" @@ -126,7 +126,8 @@ def test_pinecone_index_empty_batch(): indexer = create_pinecone_indexer() indexer.index( [], - [], + None, + "some_stream" ) indexer.pinecone_index.delete.assert_not_called() indexer.pinecone_index.upsert.assert_not_called() From 3348771dc6745e7570ef1c6a72bdcd9a10344033 Mon Sep 17 00:00:00 2001 From: flash1293 Date: Fri, 13 Oct 2023 08:42:44 +0000 Subject: [PATCH 14/22] Automated Commit - Formatting Changes --- .../unit_tests/pinecone_indexer_test.py | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py b/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py index a016cabc6ab8..626ab35ad401 100644 --- a/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py +++ b/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py @@ -52,13 +52,9 @@ def test_pinecone_index_upsert_and_delete(mock_describe_index): Mock(page_content="test2", metadata={"_ab_stream": "abc"}, embedding=[4, 5, 6]), ], "ns1", - "some_stream" - ) - indexer.delete( - ["delete_id1", "delete_id2"], - "ns1", - "some_stram" + "some_stream", ) + indexer.delete(["delete_id1", "delete_id2"], "ns1", "some_stram") indexer.pinecone_index.delete.assert_called_with(filter={"_ab_record_id": {"$in": ["delete_id1", "delete_id2"]}}, namespace="ns1") indexer.pinecone_index.upsert.assert_called_with( vectors=( @@ -85,17 +81,15 @@ def test_pinecone_index_upsert_and_delete_starter(mock_describe_index): Mock(page_content="test2", metadata={"_ab_stream": "abc"}, embedding=[4, 5, 6]), ], "ns1", - "some_stream" - ) - indexer.delete( - ["delete_id1", "delete_id2"], - "ns1", - "some_stram" + "some_stream", ) + indexer.delete(["delete_id1", "delete_id2"], "ns1", "some_stram") indexer.pinecone_index.query.assert_called_with( vector=[0, 0, 0], filter={"_ab_record_id": {"$in": ["delete_id1", "delete_id2"]}}, top_k=10_000, namespace="ns1" ) - indexer.pinecone_index.delete.assert_has_calls([call(ids=["doc_id1", "doc_id2"], namespace="ns1"), call(ids=["doc_id3"], namespace="ns1")]) + indexer.pinecone_index.delete.assert_has_calls( + [call(ids=["doc_id1", "doc_id2"], namespace="ns1"), call(ids=["doc_id3"], namespace="ns1")] + ) indexer.pinecone_index.upsert.assert_called_with( vectors=( (ANY, [1, 2, 3], {"_ab_stream": "abc", "text": "test"}), From b35a0b3663ce7e2cb9e8f65c171a5f06b57d7598 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Wed, 18 Oct 2023 17:08:26 +0200 Subject: [PATCH 15/22] prepare release --- airbyte-integrations/connectors/destination-pinecone/Dockerfile | 2 +- 
.../connectors/destination-pinecone/metadata.yaml | 2 +- docs/integrations/destinations/pinecone.md | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/airbyte-integrations/connectors/destination-pinecone/Dockerfile b/airbyte-integrations/connectors/destination-pinecone/Dockerfile index c010916dcde3..4defea821888 100644 --- a/airbyte-integrations/connectors/destination-pinecone/Dockerfile +++ b/airbyte-integrations/connectors/destination-pinecone/Dockerfile @@ -38,6 +38,6 @@ COPY destination_pinecone ./destination_pinecone ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.0.16 +LABEL io.airbyte.version=0.0.17 LABEL io.airbyte.name=airbyte/destination-pinecone diff --git a/airbyte-integrations/connectors/destination-pinecone/metadata.yaml b/airbyte-integrations/connectors/destination-pinecone/metadata.yaml index acba2b7189e9..c043097a834e 100644 --- a/airbyte-integrations/connectors/destination-pinecone/metadata.yaml +++ b/airbyte-integrations/connectors/destination-pinecone/metadata.yaml @@ -20,7 +20,7 @@ data: connectorSubtype: database connectorType: destination definitionId: 3d2b6f84-7f0d-4e3f-a5e5-7c7d4b50eabd - dockerImageTag: 0.0.16 + dockerImageTag: 0.0.17 dockerRepository: airbyte/destination-pinecone githubIssueLabel: destination-pinecone icon: pinecone.svg diff --git a/docs/integrations/destinations/pinecone.md b/docs/integrations/destinations/pinecone.md index 483749a28e89..fa4f62644ab7 100644 --- a/docs/integrations/destinations/pinecone.md +++ b/docs/integrations/destinations/pinecone.md @@ -74,6 +74,7 @@ OpenAI and Fake embeddings produce vectors with 1536 dimensions, and the Cohere | Version | Date | Pull Request | Subject | |:--------| :--------- |:--------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------| +| 0.0.17 | 2023-10-20 | [#31329](https://github.com/airbytehq/airbyte/pull/31373) | Add support for namespaces | | 0.0.16 | 2023-10-15 | [#31329](https://github.com/airbytehq/airbyte/pull/31329) | Add OpenAI-compatible embedder option | | 0.0.15 | 2023-10-04 | [#31075](https://github.com/airbytehq/airbyte/pull/31075) | Fix OpenAI embedder batch size | | 0.0.14 | 2023-09-29 | [#30820](https://github.com/airbytehq/airbyte/pull/30820) | Update CDK | From 58b34b92c4c39a67d920c99ac05c80fbf8ca9bb7 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Thu, 19 Oct 2023 16:00:47 +0200 Subject: [PATCH 16/22] fix namespace clearing error --- .../destination-pinecone/destination_pinecone/indexer.py | 4 ++-- airbyte-integrations/connectors/destination-pinecone/setup.py | 2 +- docs/integrations/destinations/pinecone.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py index 94cb466df384..8ebe2bc3e45e 100644 --- a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py +++ b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py @@ -8,7 +8,7 @@ import pinecone from airbyte_cdk.destinations.vector_db_based.document_processor import METADATA_RECORD_ID_FIELD, METADATA_STREAM_FIELD from airbyte_cdk.destinations.vector_db_based.indexer import Indexer -from 
airbyte_cdk.destinations.vector_db_based.utils import create_chunks, format_exception +from airbyte_cdk.destinations.vector_db_based.utils import create_chunks, format_exception, create_stream_identifier from airbyte_cdk.models.airbyte_protocol import ConfiguredAirbyteCatalog, DestinationSyncMode from destination_pinecone.config import PineconeIndexingModel @@ -38,7 +38,7 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog): self._pod_type = index_description.pod_type for stream in catalog.streams: if stream.destination_sync_mode == DestinationSyncMode.overwrite: - self.delete_vectors(filter={METADATA_STREAM_FIELD: stream.stream.name}, namespace=stream.stream.namespace) + self.delete_vectors(filter={METADATA_STREAM_FIELD: create_stream_identifier(stream.stream)}, namespace=stream.stream.namespace) def post_sync(self): return [] diff --git a/airbyte-integrations/connectors/destination-pinecone/setup.py b/airbyte-integrations/connectors/destination-pinecone/setup.py index 8c2727e0c6dc..8d8f8fc00019 100644 --- a/airbyte-integrations/connectors/destination-pinecone/setup.py +++ b/airbyte-integrations/connectors/destination-pinecone/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk[vector-db-based]==0.51.34", + "airbyte-cdk[vector-db-based]==0.51.41", "pinecone-client[grpc]", ] diff --git a/docs/integrations/destinations/pinecone.md b/docs/integrations/destinations/pinecone.md index fa4f62644ab7..44b0ecbd3215 100644 --- a/docs/integrations/destinations/pinecone.md +++ b/docs/integrations/destinations/pinecone.md @@ -74,7 +74,7 @@ OpenAI and Fake embeddings produce vectors with 1536 dimensions, and the Cohere | Version | Date | Pull Request | Subject | |:--------| :--------- |:--------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------| -| 0.0.17 | 2023-10-20 | [#31329](https://github.com/airbytehq/airbyte/pull/31373) | Add support for namespaces | +| 0.0.17 | 2023-10-20 | [#31329](https://github.com/airbytehq/airbyte/pull/31373) | Add support for namespaces and fix index cleaning when namespace is defined | | 0.0.16 | 2023-10-15 | [#31329](https://github.com/airbytehq/airbyte/pull/31329) | Add OpenAI-compatible embedder option | | 0.0.15 | 2023-10-04 | [#31075](https://github.com/airbytehq/airbyte/pull/31075) | Fix OpenAI embedder batch size | | 0.0.14 | 2023-09-29 | [#30820](https://github.com/airbytehq/airbyte/pull/30820) | Update CDK | From 2b0da7482083fd02c3474800fe574a671bf3d180 Mon Sep 17 00:00:00 2001 From: flash1293 Date: Thu, 19 Oct 2023 14:34:40 +0000 Subject: [PATCH 17/22] Automated Commit - Formatting Changes --- .../destination-pinecone/destination_pinecone/indexer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py index 8ebe2bc3e45e..92c64d676b0e 100644 --- a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py +++ b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py @@ -8,7 +8,7 @@ import pinecone from airbyte_cdk.destinations.vector_db_based.document_processor import METADATA_RECORD_ID_FIELD, METADATA_STREAM_FIELD from airbyte_cdk.destinations.vector_db_based.indexer import Indexer -from 
airbyte_cdk.destinations.vector_db_based.utils import create_chunks, format_exception, create_stream_identifier +from airbyte_cdk.destinations.vector_db_based.utils import create_chunks, create_stream_identifier, format_exception from airbyte_cdk.models.airbyte_protocol import ConfiguredAirbyteCatalog, DestinationSyncMode from destination_pinecone.config import PineconeIndexingModel @@ -38,7 +38,9 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog): self._pod_type = index_description.pod_type for stream in catalog.streams: if stream.destination_sync_mode == DestinationSyncMode.overwrite: - self.delete_vectors(filter={METADATA_STREAM_FIELD: create_stream_identifier(stream.stream)}, namespace=stream.stream.namespace) + self.delete_vectors( + filter={METADATA_STREAM_FIELD: create_stream_identifier(stream.stream)}, namespace=stream.stream.namespace + ) def post_sync(self): return [] From a3430985a9c952d31d44a2dba7660c99a6dfc076 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 20 Oct 2023 10:15:13 +0200 Subject: [PATCH 18/22] fix tests --- .../destination-pinecone/unit_tests/pinecone_indexer_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py b/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py index 626ab35ad401..c6f50424ade6 100644 --- a/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py +++ b/airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py @@ -184,7 +184,7 @@ def generate_catalog(): def test_pinecone_pre_sync(mock_describe_index): indexer = create_pinecone_indexer() indexer.pre_sync(generate_catalog()) - indexer.pinecone_index.delete.assert_called_with(filter={"_ab_stream": "example_stream2"}, namespace="ns2") + indexer.pinecone_index.delete.assert_called_with(filter={"_ab_stream": "ns2_example_stream2"}, namespace="ns2") def test_pinecone_pre_sync_starter(mock_describe_index): @@ -196,7 +196,7 @@ def test_pinecone_pre_sync_starter(mock_describe_index): ] indexer.pre_sync(generate_catalog()) indexer.pinecone_index.query.assert_called_with( - vector=[0, 0, 0], filter={"_ab_stream": "example_stream2"}, top_k=10_000, namespace="ns2" + vector=[0, 0, 0], filter={"_ab_stream": "ns2_example_stream2"}, top_k=10_000, namespace="ns2" ) indexer.pinecone_index.delete.assert_called_with(ids=["doc_id1", "doc_id2"], namespace="ns2") From 4a70c1a950f59b144a9ae71f5a14fa86f340d956 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 20 Oct 2023 11:09:16 +0200 Subject: [PATCH 19/22] remove dockerfile again --- .../destination-pinecone/Dockerfile | 43 ------------------- 1 file changed, 43 deletions(-) delete mode 100644 airbyte-integrations/connectors/destination-pinecone/Dockerfile diff --git a/airbyte-integrations/connectors/destination-pinecone/Dockerfile b/airbyte-integrations/connectors/destination-pinecone/Dockerfile deleted file mode 100644 index 4defea821888..000000000000 --- a/airbyte-integrations/connectors/destination-pinecone/Dockerfile +++ /dev/null @@ -1,43 +0,0 @@ -FROM python:3.10-slim as base - -# build and load all requirements -FROM base as builder -WORKDIR /airbyte/integration_code - -# upgrade pip to the latest version - -COPY setup.py ./ - -RUN pip install --upgrade pip - -# This is required because the current connector dependency is not compatible with the CDK version -# An older CDK version will be used, which depends on pyYAML 5.4, for which we 
need to pin Cython to <3.0 -# As of today the CDK version that satisfies the main dependency requirements, is 0.1.80 ... -RUN pip install --prefix=/install "Cython<3.0" "pyyaml~=5.4" --no-build-isolation - -# install necessary packages to a temporary folder -RUN pip install --prefix=/install . - -# build a clean environment -FROM base -WORKDIR /airbyte/integration_code - -# copy all loaded and built libraries to a pure basic image -COPY --from=builder /install /usr/local -# add default timezone settings -COPY --from=builder /usr/share/zoneinfo/Etc/UTC /etc/localtime -RUN echo "Etc/UTC" > /etc/timezone - -# bash is installed for more convenient debugging. -RUN apt-get install bash - -# copy payload code only -COPY main.py ./ -COPY destination_pinecone ./destination_pinecone - -ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" -ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] - -LABEL io.airbyte.version=0.0.17 - -LABEL io.airbyte.name=airbyte/destination-pinecone From f75b0471e05fc7e45c8431f8eb3cf585c146ad5f Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 20 Oct 2023 11:15:46 +0200 Subject: [PATCH 20/22] fix tests --- .../integration_tests/spec.json | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json b/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json index 31e76e8d43b3..b48548f721de 100644 --- a/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json +++ b/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json @@ -199,6 +199,32 @@ "type": "array", "items": { "type": "string" } }, + "field_name_mappings": { + "title": "Field name mappings", + "description": "List of fields to rename. 
Not applicable for nested fields, but can be used to rename fields already flattened via dot notation.", + "default": [], + "type": "array", + "items": { + "title": "FieldNameMappingConfigModel", + "type": "object", + "properties": { + "from_field": { + "title": "From field name", + "description": "The field name in the source", + "type": "string" + }, + "to_field": { + "title": "To field name", + "description": "The field name to use in the destination", + "type": "string" + } + }, + "required": [ + "from_field", + "to_field" + ] + } + }, "text_splitter": { "title": "Text splitter", "description": "Split text fields into chunks based on the specified method.", From 498abbffb0aab71b2086885b15e4deb1bef50fa1 Mon Sep 17 00:00:00 2001 From: flash1293 Date: Fri, 20 Oct 2023 09:43:37 +0000 Subject: [PATCH 21/22] Automated Commit - Formatting Changes --- .../destination-pinecone/integration_tests/spec.json | 5 +---- .../connectors/source-paypal-transaction/metadata.yaml | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json b/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json index b48548f721de..22873f556522 100644 --- a/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json +++ b/airbyte-integrations/connectors/destination-pinecone/integration_tests/spec.json @@ -219,10 +219,7 @@ "type": "string" } }, - "required": [ - "from_field", - "to_field" - ] + "required": ["from_field", "to_field"] } }, "text_splitter": { diff --git a/airbyte-integrations/connectors/source-paypal-transaction/metadata.yaml b/airbyte-integrations/connectors/source-paypal-transaction/metadata.yaml index 3f8d1079e504..0e8df098b323 100644 --- a/airbyte-integrations/connectors/source-paypal-transaction/metadata.yaml +++ b/airbyte-integrations/connectors/source-paypal-transaction/metadata.yaml @@ -20,7 +20,7 @@ data: name: Paypal Transaction registries: cloud: - dockerImageTag: 2.0.0 #https://github.com/airbytehq/oncall/issues/3347 + dockerImageTag: 2.0.0 #https://github.com/airbytehq/oncall/issues/3347 enabled: true oss: enabled: true From ee14f88841cbe5d17171c8946b19ea3a76fcde52 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 20 Oct 2023 12:24:30 +0200 Subject: [PATCH 22/22] Update destination.py --- .../destination-pinecone/destination_pinecone/destination.py | 1 - 1 file changed, 1 deletion(-) diff --git a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py index 44bb48a7becd..1b5a613ad4d7 100644 --- a/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py +++ b/airbyte-integrations/connectors/destination-pinecone/destination_pinecone/destination.py @@ -32,7 +32,6 @@ def write( config_model = ConfigModel.parse_obj(config) self._init_indexer(config_model) writer = Writer(config_model.processing, self.indexer, self.embedder, batch_size=BATCH_SIZE) - print(writer) yield from writer.write(configured_catalog, input_messages) def check(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> AirbyteConnectionStatus:
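
The common thread of the indexer diffs above is the split of the old combined index(document_chunks, delete_ids) method into index(document_chunks, namespace, stream) and delete(delete_ids, namespace, stream), with the Pinecone indexer additionally threading the stream namespace through its query, upsert and delete calls. The following is a minimal illustrative sketch of that contract only, assuming a stand-in Chunk type and an in-memory store; it is not part of the patch series, and the real connectors subclass airbyte_cdk.destinations.vector_db_based.indexer.Indexer as shown in the diffs above.

from dataclasses import dataclass
from typing import Any, Dict, List, Optional


@dataclass
class Chunk:
    # stand-in for the CDK's Chunk type: content, metadata and embedding
    page_content: str
    metadata: Dict[str, Any]
    embedding: List[float]


class InMemoryIndexer:
    """Toy indexer following the split index/delete contract used by the patched connectors."""

    def __init__(self) -> None:
        # entries are grouped per namespace, mirroring how Pinecone scopes data to namespaces
        self._store: Dict[Optional[str], List[Dict[str, Any]]] = {}

    def index(self, document_chunks: List[Chunk], namespace: Optional[str], stream: str) -> None:
        # pure write path: record deletions no longer happen here
        bucket = self._store.setdefault(namespace, [])
        for chunk in document_chunks:
            bucket.append(
                {"_ab_stream": stream, "text": chunk.page_content, "vector": chunk.embedding, **chunk.metadata}
            )

    def delete(self, delete_ids: List[str], namespace: Optional[str], stream: str) -> None:
        # delete path, scoped to the namespace (analogous to pinecone_index.delete(filter=..., namespace=...))
        if not delete_ids:
            return
        bucket = self._store.get(namespace, [])
        self._store[namespace] = [entry for entry in bucket if entry.get("_ab_record_id") not in delete_ids]

With the two paths separated, inserts and deletions can be dispatched independently, and a namespaced destination such as Pinecone can keep both overwrite-mode cleanup in pre_sync and record-level deletion confined to the namespace of the stream being synced, which is what the namespace-related hunks in the Pinecone indexer and its tests exercise.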