From efdc945d42e9e5350b3448ed9f420d14d28171fb Mon Sep 17 00:00:00 2001 From: Daryna Ishchenko <80129833+darynaishchenko@users.noreply.github.com> Date: Fri, 13 Oct 2023 19:24:45 +0300 Subject: [PATCH] :sparkles: Source Gitlab: spec: start date is optional, groups and projects to array (#31375) --- .../connectors/source-gitlab/Dockerfile | 2 +- .../source-gitlab/acceptance-test-config.yml | 2 + .../connectors/source-gitlab/main.py | 3 + .../connectors/source-gitlab/metadata.yaml | 2 +- .../source_gitlab/config_migrations.py | 106 ++++++++++++++++++ .../source-gitlab/source_gitlab/source.py | 13 ++- .../source-gitlab/source_gitlab/spec.json | 26 ++++- .../source-gitlab/source_gitlab/streams.py | 29 +++-- docs/integrations/sources/gitlab.md | 11 +- 9 files changed, 168 insertions(+), 26 deletions(-) create mode 100644 airbyte-integrations/connectors/source-gitlab/source_gitlab/config_migrations.py diff --git a/airbyte-integrations/connectors/source-gitlab/Dockerfile b/airbyte-integrations/connectors/source-gitlab/Dockerfile index d932f9c25b51..09e12d20c349 100644 --- a/airbyte-integrations/connectors/source-gitlab/Dockerfile +++ b/airbyte-integrations/connectors/source-gitlab/Dockerfile @@ -13,5 +13,5 @@ COPY main.py ./ ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=1.8.0 +LABEL io.airbyte.version=1.8.1 LABEL io.airbyte.name=airbyte/source-gitlab diff --git a/airbyte-integrations/connectors/source-gitlab/acceptance-test-config.yml b/airbyte-integrations/connectors/source-gitlab/acceptance-test-config.yml index 60ad548d7c83..51408354b0b9 100644 --- a/airbyte-integrations/connectors/source-gitlab/acceptance-test-config.yml +++ b/airbyte-integrations/connectors/source-gitlab/acceptance-test-config.yml @@ -4,6 +4,8 @@ acceptance_tests: spec: tests: - spec_path: "source_gitlab/spec.json" + backward_compatibility_tests_config: + disable_for_version: 1.8.0 connection: tests: - config_path: "secrets/config.json" diff --git a/airbyte-integrations/connectors/source-gitlab/main.py b/airbyte-integrations/connectors/source-gitlab/main.py index 6a0fe4988cc0..12b7cc841691 100644 --- a/airbyte-integrations/connectors/source-gitlab/main.py +++ b/airbyte-integrations/connectors/source-gitlab/main.py @@ -7,7 +7,10 @@ from airbyte_cdk.entrypoint import launch from source_gitlab import SourceGitlab +from source_gitlab.config_migrations import MigrateGroups, MigrateProjects if __name__ == "__main__": source = SourceGitlab() + MigrateGroups.migrate(sys.argv[1:], source) + MigrateProjects.migrate(sys.argv[1:], source) launch(source, sys.argv[1:]) diff --git a/airbyte-integrations/connectors/source-gitlab/metadata.yaml b/airbyte-integrations/connectors/source-gitlab/metadata.yaml index 9ead8e367bbc..1213c839c908 100644 --- a/airbyte-integrations/connectors/source-gitlab/metadata.yaml +++ b/airbyte-integrations/connectors/source-gitlab/metadata.yaml @@ -5,7 +5,7 @@ data: connectorSubtype: api connectorType: source definitionId: 5e6175e5-68e1-4c17-bff9-56103bbb0d80 - dockerImageTag: 1.8.0 + dockerImageTag: 1.8.1 dockerRepository: airbyte/source-gitlab githubIssueLabel: source-gitlab icon: gitlab.svg diff --git a/airbyte-integrations/connectors/source-gitlab/source_gitlab/config_migrations.py b/airbyte-integrations/connectors/source-gitlab/source_gitlab/config_migrations.py new file mode 100644 index 000000000000..0f963256cbcb --- /dev/null +++ b/airbyte-integrations/connectors/source-gitlab/source_gitlab/config_migrations.py @@ -0,0 +1,106 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# +import abc +import logging +from abc import ABC +from typing import Any, List, Mapping + +from airbyte_cdk.config_observation import create_connector_config_control_message +from airbyte_cdk.entrypoint import AirbyteEntrypoint +from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository + +from .source import SourceGitlab + +logger = logging.getLogger("airbyte_logger") + + +class MigrateStringToArray(ABC): + """ + This class stands for migrating the config at runtime, + while providing the backward compatibility when falling back to the previous source version. + + Specifically, starting from `1.7.1`, the `groups` and `projects` properties should be like : + > List(["", "", ..., ""]) + instead of, in ` 1.7.0`: + > JSON STR: "group1 group2" + """ + + message_repository: MessageRepository = InMemoryMessageRepository() + + @property + @abc.abstractmethod + def migrate_from_key(self) -> str: + ... + + @property + @abc.abstractmethod + def migrate_to_key(self) -> str: + ... + + @classmethod + def _should_migrate(cls, config: Mapping[str, Any]) -> bool: + """ + This method determines whether config require migration. + Returns: + > True, if the transformation is necessary + > False, otherwise. + """ + if cls.migrate_from_key in config and cls.migrate_to_key not in config: + return True + return False + + @classmethod + def _transform_to_array(cls, config: Mapping[str, Any], source: SourceGitlab = None) -> Mapping[str, Any]: + # assign old values to new property that will be used within the new version + config[cls.migrate_to_key] = config[cls.migrate_to_key] if cls.migrate_to_key in config else [] + data = set(filter(None, config.get(cls.migrate_from_key).split(" "))) + config[cls.migrate_to_key] = list(data | set(config[cls.migrate_to_key])) + return config + + @classmethod + def _modify_and_save(cls, config_path: str, source: SourceGitlab, config: Mapping[str, Any]) -> Mapping[str, Any]: + # modify the config + migrated_config = cls._transform_to_array(config, source) + # save the config + source.write_config(migrated_config, config_path) + # return modified config + return migrated_config + + @classmethod + def _emit_control_message(cls, migrated_config: Mapping[str, Any]) -> None: + # add the Airbyte Control Message to message repo + cls.message_repository.emit_message(create_connector_config_control_message(migrated_config)) + # emit the Airbyte Control Message from message queue to stdout + for message in cls.message_repository._message_queue: + print(message.json(exclude_unset=True)) + + @classmethod + def migrate(cls, args: List[str], source: SourceGitlab) -> None: + """ + This method checks the input args, should the config be migrated, + transform if necessary and emit the CONTROL message. + """ + # get config path + config_path = AirbyteEntrypoint(source).extract_config(args) + # proceed only if `--config` arg is provided + if config_path: + # read the existing config + config = source.read_config(config_path) + # migration check + if cls._should_migrate(config): + cls._emit_control_message( + cls._modify_and_save(config_path, source, config), + ) + + +class MigrateGroups(MigrateStringToArray): + + migrate_from_key: str = "groups" + migrate_to_key: str = "groups_list" + + +class MigrateProjects(MigrateStringToArray): + + migrate_from_key: str = "projects" + migrate_to_key: str = "projects_list" diff --git a/airbyte-integrations/connectors/source-gitlab/source_gitlab/source.py b/airbyte-integrations/connectors/source-gitlab/source_gitlab/source.py index eafe4492fc96..893727ae2744 100644 --- a/airbyte-integrations/connectors/source-gitlab/source_gitlab/source.py +++ b/airbyte-integrations/connectors/source-gitlab/source_gitlab/source.py @@ -108,7 +108,7 @@ def _groups_stream(self, config: MutableMapping[str, Any]) -> Groups: def _projects_stream(self, config: MutableMapping[str, Any]) -> Union[Projects, GroupProjects]: if not self.__projects_stream: auth_params = self._auth_params(config) - project_ids = list(filter(None, config.get("projects", "").split(" "))) + project_ids = config.get("projects_list", []) groups_stream = self._groups_stream(config) if groups_stream.group_ids: self.__projects_stream = GroupProjects(project_ids=project_ids, parent_stream=groups_stream, **auth_params) @@ -123,7 +123,7 @@ def _auth_params(self, config: MutableMapping[str, Any]) -> Mapping[str, Any]: return self.__auth_params def _get_group_list(self, config: MutableMapping[str, Any]) -> List[str]: - group_ids = list(filter(None, config.get("groups", "").split(" "))) + group_ids = config.get("groups_list") # Gitlab exposes different APIs to get a list of groups. # We use https://docs.gitlab.com/ee/api/groups.html#list-groups in case there's no group IDs in the input config. # This API provides full information about all available groups, including subgroups. @@ -198,22 +198,23 @@ def check_connection(self, logger, config) -> Tuple[bool, Any]: def streams(self, config: MutableMapping[str, Any]) -> List[Stream]: config = self._ensure_default_values(config) auth_params = self._auth_params(config) + start_date = config.get("start_date") groups, projects = self._groups_stream(config), self._projects_stream(config) - pipelines = Pipelines(parent_stream=projects, start_date=config["start_date"], **auth_params) - merge_requests = MergeRequests(parent_stream=projects, start_date=config["start_date"], **auth_params) + pipelines = Pipelines(parent_stream=projects, start_date=start_date, **auth_params) + merge_requests = MergeRequests(parent_stream=projects, start_date=start_date, **auth_params) epics = Epics(parent_stream=groups, **auth_params) streams = [ groups, projects, Branches(parent_stream=projects, repository_part=True, **auth_params), - Commits(parent_stream=projects, repository_part=True, start_date=config["start_date"], **auth_params), + Commits(parent_stream=projects, repository_part=True, start_date=start_date, **auth_params), epics, Deployments(parent_stream=projects, **auth_params), EpicIssues(parent_stream=epics, **auth_params), GroupIssueBoards(parent_stream=groups, **auth_params), - Issues(parent_stream=projects, start_date=config["start_date"], **auth_params), + Issues(parent_stream=projects, start_date=start_date, **auth_params), Jobs(parent_stream=pipelines, **auth_params), ProjectMilestones(parent_stream=projects, **auth_params), GroupMilestones(parent_stream=groups, **auth_params), diff --git a/airbyte-integrations/connectors/source-gitlab/source_gitlab/spec.json b/airbyte-integrations/connectors/source-gitlab/source_gitlab/spec.json index 9061dadafb07..75b2fc3c487c 100644 --- a/airbyte-integrations/connectors/source-gitlab/source_gitlab/spec.json +++ b/airbyte-integrations/connectors/source-gitlab/source_gitlab/spec.json @@ -4,7 +4,7 @@ "$schema": "http://json-schema.org/draft-07/schema#", "title": "Source Gitlab Spec", "type": "object", - "required": ["start_date", "credentials"], + "required": ["credentials"], "additionalProperties": true, "properties": { "credentials": { @@ -76,7 +76,7 @@ "start_date": { "type": "string", "title": "Start Date", - "description": "The date from which you'd like to replicate data for GitLab API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.", + "description": "The date from which you'd like to replicate data for GitLab API, in the format YYYY-MM-DDT00:00:00Z. Optional. If not set, all data will be replicated. All data generated after this date will be replicated.", "examples": ["2021-03-01T00:00:00Z"], "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z$", "order": 1, @@ -98,13 +98,33 @@ "type": "string", "examples": ["airbyte.io"], "title": "Groups", - "description": "Space-delimited list of groups. e.g. airbyte.io.", + "description": "[DEPRECATED] Space-delimited list of groups. e.g. airbyte.io.", + "airbyte_hidden": true + }, + "groups_list": { + "type": "array", + "items": { + "type": "string" + }, + "examples": ["airbyte.io"], + "title": "Groups", + "description": "List of groups. e.g. airbyte.io.", "order": 3 }, "projects": { "type": "string", "title": "Projects", "examples": ["airbyte.io/documentation"], + "description": "[DEPRECATED] Space-delimited list of projects. e.g. airbyte.io/documentation meltano/tap-gitlab.", + "airbyte_hidden": true + }, + "projects_list": { + "type": "array", + "items": { + "type": "string" + }, + "title": "Projects", + "examples": ["airbyte.io/documentation"], "description": "Space-delimited list of projects. e.g. airbyte.io/documentation meltano/tap-gitlab.", "order": 4 } diff --git a/airbyte-integrations/connectors/source-gitlab/source_gitlab/streams.py b/airbyte-integrations/connectors/source-gitlab/source_gitlab/streams.py index 0c947f05cf87..d561009f13bc 100644 --- a/airbyte-integrations/connectors/source-gitlab/source_gitlab/streams.py +++ b/airbyte-integrations/connectors/source-gitlab/source_gitlab/streams.py @@ -187,22 +187,31 @@ def stream_slices( stream_state = stream_state or {} super_slices = super().stream_slices(sync_mode, cursor_field, stream_state) for super_slice in super_slices: - start_point = self._start_date state_project_value = stream_state.get(str(super_slice["id"])) - if state_project_value: - state_value = state_project_value.get(self.cursor_field) - if state_value: - start_point = max(start_point, state_value) - for start_dt, end_dt in self._chunk_date_range(pendulum.parse(start_point)): + if self._start_date or state_project_value: + start_point = self._start_date + if state_project_value: + state_value = state_project_value.get(self.cursor_field) + if state_value and start_point: + start_point = max(start_point, state_value) + else: + start_point = state_value or start_point + for start_dt, end_dt in self._chunk_date_range(pendulum.parse(start_point)): + stream_slice = {key: value for key, value in super_slice.items()} + stream_slice[self.lower_bound_filter] = start_dt + stream_slice[self.upper_bound_filter] = end_dt + yield stream_slice + else: stream_slice = {key: value for key, value in super_slice.items()} - stream_slice[self.lower_bound_filter] = start_dt - stream_slice[self.upper_bound_filter] = end_dt yield stream_slice def request_params(self, stream_state=None, stream_slice: Mapping[str, Any] = None, **kwargs): params = super().request_params(stream_state, stream_slice, **kwargs) - params[self.lower_bound_filter] = stream_slice[self.lower_bound_filter] - params[self.upper_bound_filter] = stream_slice[self.upper_bound_filter] + lower_bound_filter = stream_slice.get(self.lower_bound_filter) + upper_bound_filter = stream_slice.get(self.upper_bound_filter) + if lower_bound_filter and upper_bound_filter: + params[self.lower_bound_filter] = lower_bound_filter + params[self.upper_bound_filter] = upper_bound_filter return params diff --git a/docs/integrations/sources/gitlab.md b/docs/integrations/sources/gitlab.md index 6ae09bea1096..6e82a57836bc 100644 --- a/docs/integrations/sources/gitlab.md +++ b/docs/integrations/sources/gitlab.md @@ -5,7 +5,7 @@ This page contains the setup guide and reference information for the Gitlab Sour ## Prerequisites - Gitlab instance or an account at [Gitlab](https://gitlab.com) -- Start date +- Start date (Optional) - GitLab Groups (Optional) - GitLab Projects (Optional) @@ -49,10 +49,10 @@ Log into [GitLab](https://gitlab.com) and then generate a [personal access token 3. On the source setup page, select **GitLab** from the Source type dropdown and enter a name for this connector. 4. Click `Authenticate your GitLab account` by selecting Oauth or Personal Access Token for Authentication. 5. Log in and Authorize to the GitLab account. -6. **Start date** - The date from which you'd like to replicate data for streams. -7. **API URL** - The URL to access you self-hosted GitLab instance or `gitlab.com` (default). -8. **Groups (Optional)** - Space-delimited list of GitLab group IDs, e.g. `airbytehq` for single group, `airbytehq another-repo` for multiple groups. -9. **Projects (Optional)** - Space-delimited list of GitLab projects to pull data for, e.g. `airbytehq/airbyte`. +6. **API URL** - The URL to access you self-hosted GitLab instance or `gitlab.com` (default). +7. **Start date (Optional)** - The date from which you'd like to replicate data for streams. +8. **Groups (Optional)** - List of GitLab group IDs, e.g. `airbytehq` for single group, `airbytehq another-repo` for multiple groups. +9. **Projects (Optional)** - List of GitLab projects to pull data for, e.g. `airbytehq/airbyte`. 10. Click **Set up source**. **Note:** You can specify either Group IDs or Project IDs in the source configuration. If both fields are blank, the connector will retrieve a list of all the groups that are accessible to the configured token and ingest as normal. @@ -112,6 +112,7 @@ Gitlab has the [rate limits](https://docs.gitlab.com/ee/user/gitlab_com/index.ht | Version | Date | Pull Request | Subject | |:--------|:-----------|:---------------------------------------------------------|:-------------------------------------------------------------------------------------------| +| 1.8.1 | 2023-10-12 | [31375](https://github.com/airbytehq/airbyte/pull/31375) | Mark `start_date` as optional, migrate `groups` and `projects` to array | | 1.8.0 | 2023-10-12 | [31339](https://github.com/airbytehq/airbyte/pull/31339) | Add undeclared fields to streams schemas, validate date/date-time format in stream schemas | | 1.7.1 | 2023-10-10 | [31210](https://github.com/airbytehq/airbyte/pull/31210) | Added expired `access_token` handling, while checking the connection | | 1.7.0 | 2023-08-08 | [27869](https://github.com/airbytehq/airbyte/pull/29203) | Add Deployments stream |