From 56879a9399c716f00e8c3e929e76c431bb7d0fd6 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 12 Oct 2023 11:04:42 +0200 Subject: [PATCH 1/7] Source Apify Dataset: add item_collection stream with dynamic schema --- .../source_apify_dataset/manifest.yaml | 16 +++++++++++ .../schemas/item_collection.json | 12 +++++++++ .../wrapping_dpath_extractor.py | 27 +++++++++++++++++++ 3 files changed, 55 insertions(+) create mode 100644 airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/schemas/item_collection.json create mode 100644 airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/wrapping_dpath_extractor.py diff --git a/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/manifest.yaml b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/manifest.yaml index 600dfa99356a..820c0f505c63 100644 --- a/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/manifest.yaml +++ b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/manifest.yaml @@ -107,9 +107,25 @@ streams: type: DpathExtractor field_path: [] + - type: DeclarativeStream + name: item_collection + $parameters: + path: "datasets/{{ config['dataset_id'] }}/items" + schema_loader: + type: JsonFileSchemaLoader + file_path: "./source_apify_dataset/schemas/item_collection" + retriever: + $ref: "#/definitions/retriever" + record_selector: + type: RecordSelector + extractor: + class_name: source_apify_dataset.wrapping_dpath_extractor.WrappingDpathExtractor + field_path: ["data"] + check: type: CheckStream stream_names: - dataset_collection - dataset - item_collection_website_content_crawler + - item_collection diff --git a/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/schemas/item_collection.json b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/schemas/item_collection.json new file mode 100644 index 000000000000..5ceff1848c55 --- /dev/null +++ b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/schemas/item_collection.json @@ -0,0 +1,12 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Item collection", + "type": ["null", "object"], + "additionalProperties": true, + "properties": { + "data": { + "additionalProperties": true, + "type": ["null", "object"] + } + } +} diff --git a/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/wrapping_dpath_extractor.py b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/wrapping_dpath_extractor.py new file mode 100644 index 000000000000..1d58c46cd654 --- /dev/null +++ b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/wrapping_dpath_extractor.py @@ -0,0 +1,27 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import dataclass +from typing import Any, Mapping, Union + +import dpath.util +import requests +from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.types import Record + + +@dataclass +class WrappingDpathExtractor(DpathExtractor): + """ + Record extractor that wraps the extracted value into a dict, with the value being set to the key `data`. + This is done because the actual shape of the data is dynamic, so by wrapping everything into a `data` object + it can be specified as a generic object in the schema. + + Note that this will cause fields to not be normalized in the destination. + """ + + def extract_records(self, response: requests.Response) -> list[Record]: + records = super().extract_records(response) + return [{"data": record} for record in records] From 312461efd15a75bdafd535ddfed3c7e95538116b Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Thu, 12 Oct 2023 16:01:52 +0200 Subject: [PATCH 2/7] fix small issues --- .../source-apify-dataset/source_apify_dataset/manifest.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/manifest.yaml b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/manifest.yaml index 820c0f505c63..1d2bd898809d 100644 --- a/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/manifest.yaml +++ b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/manifest.yaml @@ -113,14 +113,14 @@ streams: path: "datasets/{{ config['dataset_id'] }}/items" schema_loader: type: JsonFileSchemaLoader - file_path: "./source_apify_dataset/schemas/item_collection" + file_path: "./source_apify_dataset/schemas/item_collection.json" retriever: $ref: "#/definitions/retriever" record_selector: type: RecordSelector extractor: class_name: source_apify_dataset.wrapping_dpath_extractor.WrappingDpathExtractor - field_path: ["data"] + field_path: [] check: type: CheckStream From 9860b90d99f08f48e3939f2f7c4382a16650cc24 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 12 Oct 2023 11:04:42 +0200 Subject: [PATCH 3/7] Source Apify Dataset: add item_collection stream with dynamic schema --- .../source-apify-dataset/Dockerfile | 2 +- .../source-apify-dataset/metadata.yaml | 5 +++- .../source_apify_dataset/manifest.yaml | 16 +++++++++++ .../schemas/item_collection.json | 12 +++++++++ .../wrapping_dpath_extractor.py | 27 +++++++++++++++++++ .../sources/apify-dataset-migrations.md | 6 ++++- docs/integrations/sources/apify-dataset.md | 10 +++++-- 7 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/schemas/item_collection.json create mode 100644 airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/wrapping_dpath_extractor.py diff --git a/airbyte-integrations/connectors/source-apify-dataset/Dockerfile b/airbyte-integrations/connectors/source-apify-dataset/Dockerfile index e83b72f6ecf0..c51f4d752c98 100644 --- a/airbyte-integrations/connectors/source-apify-dataset/Dockerfile +++ b/airbyte-integrations/connectors/source-apify-dataset/Dockerfile @@ -34,5 +34,5 @@ COPY source_apify_dataset ./source_apify_dataset ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=2.0.0 +LABEL io.airbyte.version=2.1.0 LABEL io.airbyte.name=airbyte/source-apify-dataset diff --git a/airbyte-integrations/connectors/source-apify-dataset/metadata.yaml b/airbyte-integrations/connectors/source-apify-dataset/metadata.yaml index 3c165d9154d0..c58893c29fac 100644 --- a/airbyte-integrations/connectors/source-apify-dataset/metadata.yaml +++ b/airbyte-integrations/connectors/source-apify-dataset/metadata.yaml @@ -11,7 +11,7 @@ data: connectorSubtype: api connectorType: source definitionId: 47f17145-fe20-4ef5-a548-e29b048adf84 - dockerImageTag: 2.0.0 + dockerImageTag: 2.1.0 dockerRepository: airbyte/source-apify-dataset githubIssueLabel: source-apify-dataset icon: apify-dataset.svg @@ -27,6 +27,9 @@ data: 2.0.0: upgradeDeadline: 2023-09-18 message: "Fix broken stream, manifest refactor" + 2.1.0: + upgradeDeadline: 2023-10-13 + message: "Add a new stream item_collection" supportLevel: community documentationUrl: https://docs.airbyte.com/integrations/sources/apify-dataset tags: diff --git a/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/manifest.yaml b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/manifest.yaml index 600dfa99356a..1d2bd898809d 100644 --- a/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/manifest.yaml +++ b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/manifest.yaml @@ -107,9 +107,25 @@ streams: type: DpathExtractor field_path: [] + - type: DeclarativeStream + name: item_collection + $parameters: + path: "datasets/{{ config['dataset_id'] }}/items" + schema_loader: + type: JsonFileSchemaLoader + file_path: "./source_apify_dataset/schemas/item_collection.json" + retriever: + $ref: "#/definitions/retriever" + record_selector: + type: RecordSelector + extractor: + class_name: source_apify_dataset.wrapping_dpath_extractor.WrappingDpathExtractor + field_path: [] + check: type: CheckStream stream_names: - dataset_collection - dataset - item_collection_website_content_crawler + - item_collection diff --git a/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/schemas/item_collection.json b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/schemas/item_collection.json new file mode 100644 index 000000000000..5ceff1848c55 --- /dev/null +++ b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/schemas/item_collection.json @@ -0,0 +1,12 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Item collection", + "type": ["null", "object"], + "additionalProperties": true, + "properties": { + "data": { + "additionalProperties": true, + "type": ["null", "object"] + } + } +} diff --git a/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/wrapping_dpath_extractor.py b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/wrapping_dpath_extractor.py new file mode 100644 index 000000000000..1d58c46cd654 --- /dev/null +++ b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/wrapping_dpath_extractor.py @@ -0,0 +1,27 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# + +from dataclasses import dataclass +from typing import Any, Mapping, Union + +import dpath.util +import requests +from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.types import Record + + +@dataclass +class WrappingDpathExtractor(DpathExtractor): + """ + Record extractor that wraps the extracted value into a dict, with the value being set to the key `data`. + This is done because the actual shape of the data is dynamic, so by wrapping everything into a `data` object + it can be specified as a generic object in the schema. + + Note that this will cause fields to not be normalized in the destination. + """ + + def extract_records(self, response: requests.Response) -> list[Record]: + records = super().extract_records(response) + return [{"data": record} for record in records] diff --git a/docs/integrations/sources/apify-dataset-migrations.md b/docs/integrations/sources/apify-dataset-migrations.md index e2c4a948a077..dbc7e63056d2 100644 --- a/docs/integrations/sources/apify-dataset-migrations.md +++ b/docs/integrations/sources/apify-dataset-migrations.md @@ -1,5 +1,9 @@ # Apify Dataset Migration Guide +## Upgrading to 2.1.0 + +A minor update adding a new stream `item_collection` for general datasets. No actions are required regarding your current connector configuration setup. + ## Upgrading to 2.0.0 Major update: The old broken Item Collection stream has been removed and replaced with a new Item Collection (WCC) stream specific for the datasets produced by [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor. Please update your connector configuration setup. Note: The schema of the Apify Dataset is at least Actor-specific, so we cannot have a general Stream with a static schema for getting data from a Dataset. @@ -7,4 +11,4 @@ Major update: The old broken Item Collection stream has been removed and replace ## Upgrading to 1.0.0 A major update fixing the data ingestion to retrieve properly data from Apify. -Please update your connector configuration setup. \ No newline at end of file +Please update your connector configuration setup. diff --git a/docs/integrations/sources/apify-dataset.md b/docs/integrations/sources/apify-dataset.md index 69f060a2ab12..e073be81372c 100644 --- a/docs/integrations/sources/apify-dataset.md +++ b/docs/integrations/sources/apify-dataset.md @@ -46,14 +46,20 @@ The Apify dataset connector uses [Apify Python Client](https://docs.apify.com/ap - Apify Personal API token (you can find it [here](https://console.apify.com/account/integrations)) - Dataset ID (check the [docs](https://docs.apify.com/platform/storage/dataset)) -### `item_collection_website_content_crawler` +### `item_collection` - Calls `api.apify.com/v2/datasets/{datasetId}/items` ([docs](https://docs.apify.com/api/v2#/reference/datasets/item-collection/get-items)) - Properties: - Apify Personal API token (you can find it [here](https://console.apify.com/account/integrations)) - Dataset ID (check the [docs](https://docs.apify.com/platform/storage/dataset)) - Limitations: - - Currently works only for the datasets produced by [Website Content Crawler](https://apify.com/apify/website-content-crawler). + - The stream uses a dynamic schema (all the data are stored under the `"data"` key), so it should support all the Apify Datasets (produced by whatever Actor). + +### `item_collection_website_content_crawler` + +- Calls the same endpoint and uses the same properties as the `item_collection` stream. +- Limitations: + - The stream uses a static schema which corresponds to the datasets produced by [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor. So only datasets produced by this Actor are supported. ## Changelog From 061f622f46ed311eccc10994ed8fd0b11a4903a9 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 13 Oct 2023 12:51:09 +0200 Subject: [PATCH 4/7] fix icon --- .../connectors/source-apify-dataset/metadata.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/source-apify-dataset/metadata.yaml b/airbyte-integrations/connectors/source-apify-dataset/metadata.yaml index c58893c29fac..5c939a9331de 100644 --- a/airbyte-integrations/connectors/source-apify-dataset/metadata.yaml +++ b/airbyte-integrations/connectors/source-apify-dataset/metadata.yaml @@ -14,7 +14,7 @@ data: dockerImageTag: 2.1.0 dockerRepository: airbyte/source-apify-dataset githubIssueLabel: source-apify-dataset - icon: apify-dataset.svg + icon: apify.svg license: MIT name: Apify Dataset releaseDate: 2023-08-25 From 1f70a5b0f843f8cac134f4d090144e5480863b1c Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 13 Oct 2023 13:05:12 +0200 Subject: [PATCH 5/7] fix build --- .../source_apify_dataset/wrapping_dpath_extractor.py | 3 --- docs/integrations/sources/apify-dataset.md | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/wrapping_dpath_extractor.py b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/wrapping_dpath_extractor.py index 1d58c46cd654..0fea713e975a 100644 --- a/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/wrapping_dpath_extractor.py +++ b/airbyte-integrations/connectors/source-apify-dataset/source_apify_dataset/wrapping_dpath_extractor.py @@ -3,12 +3,9 @@ # from dataclasses import dataclass -from typing import Any, Mapping, Union -import dpath.util import requests from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor -from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString from airbyte_cdk.sources.declarative.types import Record diff --git a/docs/integrations/sources/apify-dataset.md b/docs/integrations/sources/apify-dataset.md index e073be81372c..d672cced7c9b 100644 --- a/docs/integrations/sources/apify-dataset.md +++ b/docs/integrations/sources/apify-dataset.md @@ -65,6 +65,7 @@ The Apify dataset connector uses [Apify Python Client](https://docs.apify.com/ap | Version | Date | Pull Request | Subject | | :------ | :--------- | :----------------------------------------------------------- | :-------------------------------------------------------------------------- | +| 2.1.0 | 2023-10-13 | [31333](https://github.com/airbytehq/airbyte/pull/31333) | Add stream for arbitrary datasets | | 2.0.0 | 2023-09-18 | [30428](https://github.com/airbytehq/airbyte/pull/30428) | Fix broken stream, manifest refactor | | 1.0.0 | 2023-08-25 | [29859](https://github.com/airbytehq/airbyte/pull/29859) | Migrate to lowcode | | 0.2.0 | 2022-06-20 | [28290](https://github.com/airbytehq/airbyte/pull/28290) | Make connector work with platform changes not syncing empty stream schemas. | From 19fc17144ee0541eec0777e34459f286c5f9693f Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 13 Oct 2023 13:14:31 +0200 Subject: [PATCH 6/7] Trigger Build From a298adee8762a17ae37ff36e313563834916ce02 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Fri, 13 Oct 2023 14:44:51 +0200 Subject: [PATCH 7/7] adjust metadata.yml --- .../connectors/source-apify-dataset/metadata.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/airbyte-integrations/connectors/source-apify-dataset/metadata.yaml b/airbyte-integrations/connectors/source-apify-dataset/metadata.yaml index 5c939a9331de..70a8c083a8b7 100644 --- a/airbyte-integrations/connectors/source-apify-dataset/metadata.yaml +++ b/airbyte-integrations/connectors/source-apify-dataset/metadata.yaml @@ -7,7 +7,6 @@ data: enabled: true cloud: enabled: true - dockerImageTag: 0.2.0 # https://github.com/airbytehq/airbyte/issues/30478 connectorSubtype: api connectorType: source definitionId: 47f17145-fe20-4ef5-a548-e29b048adf84 @@ -28,8 +27,8 @@ data: upgradeDeadline: 2023-09-18 message: "Fix broken stream, manifest refactor" 2.1.0: - upgradeDeadline: 2023-10-13 - message: "Add a new stream item_collection" + upgradeDeadline: 2023-10-27 + message: "Rename dataset streams" supportLevel: community documentationUrl: https://docs.airbyte.com/integrations/sources/apify-dataset tags: