Source Apify Dataset: fix broken stream, manifest refactor (#30428)
Co-authored-by: Joe Reuter <[email protected]>
Co-authored-by: flash1293 <[email protected]>
3 people authored Oct 6, 2023
1 parent c3a9a4f commit 7bff33f
Showing 14 changed files with 262 additions and 166 deletions.
@@ -34,5 +34,5 @@ COPY source_apify_dataset ./source_apify_dataset
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

LABEL io.airbyte.version=1.0.0
LABEL io.airbyte.version=2.0.0
LABEL io.airbyte.name=airbyte/source-apify-dataset
56 changes: 54 additions & 2 deletions airbyte-integrations/connectors/source-apify-dataset/README.md
@@ -5,15 +5,50 @@ For information about how to use this connector within Airbyte, see [the documen

## Local development

#### Building via Python

Create a Python virtual environment

```
virtualenv --python $(which python3.10) .venv
```

Source it

```
source .venv/bin/activate
```

Check the connector specification/definition

```
python main.py spec
```

Basic check - verify the connection to the API

```
python main.py check --config secrets/config.json
```

Integration tests - run a read operation against the API

```
python main.py read --config secrets/config.json --catalog integration_tests/configured_catalog.json
```

#### Building via Gradle

You can also build the connector in Gradle. This is typically used in CI and not needed for your development workflow.

To build using Gradle, from the Airbyte repository root, run:

```
./gradlew :airbyte-integrations:connectors:source-apify-dataset:build
```

#### Create credentials

**If you are a community contributor**, follow the instructions in the [documentation](https://docs.airbyte.com/integrations/sources/apify-dataset)
to generate the necessary credentials. Then create a file `secrets/config.json` conforming to the `source_apify_dataset/spec.yaml` file.
Note that any directory named `secrets` is gitignored across the entire Airbyte repo, so there is no danger of accidentally checking in sensitive information.
@@ -25,56 +60,73 @@ and place them into `secrets/config.json`.
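
Based on the refactored spec in this change, only `token` and `dataset_id` are required, so a minimal `secrets/config.json` could look like the following (the values are the placeholder examples from the spec, not real credentials):

```
{
  "token": "apify_api_PbVwb1cBbuvbfg2jRmAIHZKgx3NQyfEMG7uk",
  "dataset_id": "rHuMdwm6xCFt6WiGU"
}
```
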
### Locally running the connector docker image

#### Build

First, make sure you build the latest Docker image:

```
docker build . -t airbyte/source-apify-dataset:dev
```

You can also build the connector image via Gradle:

```
./gradlew :airbyte-integrations:connectors:source-apify-dataset:airbyteDocker
```

When building via Gradle, the docker image name and tag, respectively, are the values of the `io.airbyte.name` and `io.airbyte.version` `LABEL`s in
the Dockerfile.

#### Run

Then run any of the connector commands as follows:

```
docker run --rm airbyte/source-apify-dataset:dev spec
docker run --rm -v $(pwd)/secrets:/secrets airbyte/source-apify-dataset:dev check --config /secrets/config.json
docker run --rm -v $(pwd)/secrets:/secrets airbyte/source-apify-dataset:dev discover --config /secrets/config.json
docker run --rm -v $(pwd)/secrets:/secrets -v $(pwd)/integration_tests:/integration_tests airbyte/source-apify-dataset:dev read --config /secrets/config.json --catalog /integration_tests/configured_catalog.json
```

## Testing

#### Acceptance Tests

Customize the `acceptance-test-config.yml` file to configure the tests. See [Connector Acceptance Tests](https://docs.airbyte.com/connector-development/testing-connectors/connector-acceptance-tests-reference) for more information.
If your connector requires creating or destroying resources for use during acceptance tests, create fixtures for them and place them inside `integration_tests/acceptance.py`.
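
A minimal sketch of what such a fixture module usually looks like; the plugin name and fixture body below are assumptions for illustration, not taken from this connector:

```
# integration_tests/acceptance.py (illustrative sketch)
import pytest

# Assumption: the acceptance test suite is pulled in as a pytest plugin.
pytest_plugins = ("connector_acceptance_test.plugin",)


@pytest.fixture(scope="session", autouse=True)
def connector_setup():
    """Create any external resources the tests need, then tear them down."""
    # e.g. provision a test dataset here (hypothetical)
    yield
    # e.g. clean the test dataset up here (hypothetical)
```
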

To run your integration tests with Docker, run:

```
./acceptance-test-docker.sh
```

### Using gradle to run tests

All commands should be run from the Airbyte project root.
To run unit tests:

```
./gradlew :airbyte-integrations:connectors:source-apify-dataset:unitTest
```

To run acceptance and custom integration tests:

```
./gradlew :airbyte-integrations:connectors:source-apify-dataset:integrationTest
```

## Dependency Management

All of your dependencies should go in `setup.py`, NOT `requirements.txt`. The requirements file is only used to connect internal Airbyte dependencies in the monorepo for local development.
We split dependencies between two groups:

- Dependencies required for your connector to work go in the `MAIN_REQUIREMENTS` list.
- Dependencies required for testing go in the `TEST_REQUIREMENTS` list.
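
For illustration, a sketch of how that split typically appears in a connector's `setup.py`; the package names below are assumptions, not taken from this diff:

```
from setuptools import find_packages, setup

# Runtime dependencies (illustrative).
MAIN_REQUIREMENTS = ["airbyte-cdk"]

# Test-only dependencies (illustrative).
TEST_REQUIREMENTS = ["pytest", "pytest-mock", "requests-mock"]

setup(
    name="source_apify_dataset",
    packages=find_packages(exclude=("tests",)),
    install_requires=MAIN_REQUIREMENTS,
    extras_require={"tests": TEST_REQUIREMENTS},
)
```
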

### Publishing a new version of the connector

You've checked out the repo, implemented a million dollar feature, and you're ready to share your changes with the world. Now what?

1. Make sure your changes are passing unit and integration tests.
1. Bump the connector version in `Dockerfile` -- just increment the value of the `LABEL io.airbyte.version` appropriately (we use [SemVer](https://semver.org/)).
1. Create a Pull Request.
@@ -4,7 +4,7 @@ acceptance_tests:
tests:
- spec_path: "source_apify_dataset/spec.yaml"
backward_compatibility_tests_config:
disable_for_version: 0.2.0
disable_for_version: 2.0.0
connection:
tests:
- config_path: "secrets/config.json"
@@ -15,7 +15,7 @@ acceptance_tests:
tests:
- config_path: "secrets/config.json"
backward_compatibility_tests_config:
disable_for_version: 0.2.0
disable_for_version: 2.0.0
basic_read:
tests:
- config_path: "secrets/config.json"
@@ -32,7 +32,7 @@ acceptance_tests:
- config_path: "secrets/config.json"
configured_catalog_path: "integration_tests/configured_catalog.json"
ignored_fields:
datasets:
dataset_collection:
- name: "accessedAt"
bypass_reason: "Change everytime"
- name: "stats/readCount"
@@ -2,7 +2,7 @@
"streams": [
{
"stream": {
"name": "datasets",
"name": "dataset_collection",
"json_schema": {},
"supported_sync_modes": ["full_refresh"]
},
@@ -20,7 +20,7 @@
},
{
"stream": {
"name": "item_collection",
"name": "item_collection_website_content_crawler",
"json_schema": {},
"supported_sync_modes": ["full_refresh"]
},
@@ -11,7 +11,7 @@ data:
connectorSubtype: api
connectorType: source
definitionId: 47f17145-fe20-4ef5-a548-e29b048adf84
dockerImageTag: 1.0.0
dockerImageTag: 2.0.0
dockerRepository: airbyte/source-apify-dataset
githubIssueLabel: source-apify-dataset
icon: apify-dataset.svg
@@ -24,6 +24,9 @@ data:
1.0.0:
upgradeDeadline: 2023-08-30
message: "Update spec to use token and ingest all 3 streams correctly"
2.0.0:
upgradeDeadline: 2023-09-18
message: "Fix broken stream, manifest refactor"
supportLevel: community
documentationUrl: https://docs.airbyte.com/integrations/sources/apify-dataset
tags:
@@ -1,109 +1,115 @@
version: "0.29.0"
version: "0.51.11"
type: DeclarativeSource

definitions:
selector:
type: RecordSelector
extractor:
type: DpathExtractor
field_path: ["data"]
requester:
type: HttpRequester
url_base: "https://api.apify.com/v2/"
http_method: "GET"
authenticator:
type: NoAuth
request_parameters:
token: "{{ config['token'] }}"
spec:
type: Spec
documentation_url: https://docs.airbyte.com/integrations/sources/apify-dataset
connection_specification:
$schema: http://json-schema.org/draft-07/schema#
title: Apify Dataset Spec
type: object
required:
- token
- dataset_id
properties:
token:
type: string
title: API token
description: >-
Personal API token of your Apify account. In Apify Console, you can find your API token in the
<a href="https://console.apify.com/account/integrations">Settings section under the Integrations tab</a>
after you login. See the <a href="https://docs.apify.com/platform/integrations/api#api-token">Apify Docs</a>
for more information.
examples:
- apify_api_PbVwb1cBbuvbfg2jRmAIHZKgx3NQyfEMG7uk
airbyte_secret: true
dataset_id:
type: string
title: Dataset ID
description: >-
ID of the dataset you would like to load to Airbyte. In Apify Console, you can view your datasets in the
<a href="https://console.apify.com/storage/datasets">Storage section under the Datasets tab</a>
after you login. See the <a href="https://docs.apify.com/platform/storage/dataset">Apify Docs</a>
for more information.
examples:
- rHuMdwm6xCFt6WiGU
additionalProperties: true

definitions:
retriever:
type: SimpleRetriever
record_selector:
$ref: "#/definitions/selector"
paginator:
type: "NoPagination"
requester:
$ref: "#/definitions/requester"

base_paginator:
type: "DefaultPaginator"
page_size_option:
type: "RequestOption"
inject_into: "request_parameter"
field_name: "limit"
pagination_strategy:
type: "OffsetIncrement"
page_size: 50
page_token_option:
type: "RequestOption"
field_name: "offset"
inject_into: "request_parameter"

base_stream:
type: DeclarativeStream
retriever:
$ref: "#/definitions/retriever"
type: HttpRequester
url_base: "https://api.apify.com/v2/"
http_method: "GET"
authenticator:
type: BearerAuthenticator
api_token: "{{ config['token'] }}"
paginator:
type: "DefaultPaginator"
page_size_option:
type: "RequestOption"
inject_into: "request_parameter"
field_name: "limit"
pagination_strategy:
type: "OffsetIncrement"
page_size: 50
page_token_option:
type: "RequestOption"
field_name: "offset"
inject_into: "request_parameter"

datasets_stream:
$ref: "#/definitions/base_stream"
streams:
- type: DeclarativeStream
name: dataset_collection
primary_key: "id"
$parameters:
name: "datasets"
primary_key: "id"
path: "datasets"
schema_loader:
type: JsonFileSchemaLoader
file_path: "./source_apify_dataset/schemas/dataset_collection.json"
retriever:
$ref: "#/definitions/retriever"
paginator:
$ref: "#/definitions/base_paginator"
record_selector:
$ref: "#/definitions/selector"
type: RecordSelector
extractor:
type: DpathExtractor
field_path: ["data", "items"]

datasets_partition_router:
type: SubstreamPartitionRouter
parent_stream_configs:
- stream: "#/definitions/datasets_stream"
parent_key: "id"
partition_field: "parent_id"

dataset_stream:
$ref: "#/definitions/base_stream"
- type: DeclarativeStream
name: dataset
primary_key: "id"
$parameters:
name: "dataset"
primary_key: "id"
path: "datasets/{{ stream_partition.parent_id }}"
path: "datasets/{{ config['dataset_id'] }}"
schema_loader:
type: JsonFileSchemaLoader
file_path: "./source_apify_dataset/schemas/dataset.json"
retriever:
$ref: "#/definitions/retriever"
paginator:
$ref: "#/definitions/base_paginator"
partition_router:
$ref: "#/definitions/datasets_partition_router"
record_selector:
type: RecordSelector
extractor:
type: DpathExtractor
field_path: ["data"]

item_collection_stream:
$ref: "#/definitions/base_stream"
- type: DeclarativeStream
name: item_collection_website_content_crawler
$parameters:
name: "item_collection"
path: "datasets/{{ stream_partition.parent_id }}/items"
path: "datasets/{{ config['dataset_id'] }}/items"
schema_loader:
type: JsonFileSchemaLoader
file_path: "./source_apify_dataset/schemas/item_collection_wcc.json"
retriever:
$ref: "#/definitions/retriever"
paginator:
$ref: "#/definitions/base_paginator"
record_selector:
$ref: "#/definitions/selector"
type: RecordSelector
extractor:
type: DpathExtractor
field_path: []
partition_router:
$ref: "#/definitions/datasets_partition_router"

streams:
- "#/definitions/datasets_stream"
- "#/definitions/dataset_stream"
- "#/definitions/item_collection_stream"

check:
type: CheckStream
stream_names:
- "datasets"
- "dataset"
- "item_collection"
- dataset_collection
- dataset
- item_collection_website_content_crawler
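
To make the refactor concrete: the new manifest authenticates with a `BearerAuthenticator` instead of passing `token` as a query parameter, and pages through results with an offset/limit strategy (page size 50). A rough Python equivalent of what the declarative retriever does for the renamed `item_collection_website_content_crawler` stream is sketched below; the use of `requests` and the helper name are illustrative only, since the actual connector is fully declarative:

```
import requests

API_BASE = "https://api.apify.com/v2"

def read_item_collection(token: str, dataset_id: str, page_size: int = 50):
    """Illustrative equivalent of the stream's requester + DefaultPaginator."""
    offset = 0
    while True:
        response = requests.get(
            f"{API_BASE}/datasets/{dataset_id}/items",
            headers={"Authorization": f"Bearer {token}"},   # BearerAuthenticator
            params={"limit": page_size, "offset": offset},  # OffsetIncrement paginator
        )
        response.raise_for_status()
        items = response.json()  # field_path: [] -> records are the response body itself
        if not items:
            break
        yield from items
        offset += page_size
```
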