From 2332bb45e8d866b3ee08edfdd8adc49a4b534663 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Thu, 28 Dec 2023 15:00:54 +0200 Subject: [PATCH] Update version and add tests --- .../integration_tests/cloud_spec.json | 7 ++ .../source-s3/integration_tests/spec.json | 7 ++ .../connectors/source-s3/metadata.yaml | 2 +- .../connectors/source-s3/setup.py | 8 +- .../connectors/source-s3/source_s3/source.py | 2 +- .../source-s3/source_s3/v4/config.py | 4 +- .../source-s3/source_s3/v4/stream_reader.py | 62 +++++++++------- .../unit_tests/v4/test_stream_reader.py | 31 ++++++++ docs/integrations/sources/s3.md | 74 +++++++++++++++++-- 9 files changed, 153 insertions(+), 44 deletions(-) diff --git a/airbyte-integrations/connectors/source-s3/integration_tests/cloud_spec.json b/airbyte-integrations/connectors/source-s3/integration_tests/cloud_spec.json index b2593df8e5c9..64e18a28c411 100644 --- a/airbyte-integrations/connectors/source-s3/integration_tests/cloud_spec.json +++ b/airbyte-integrations/connectors/source-s3/integration_tests/cloud_spec.json @@ -358,6 +358,13 @@ "order": 2, "type": "string" }, + "role_arn": { + "title": "AWS Role ARN", + "description": "Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations requested using this profile. Set External ID as 'None'.", + "airbyte_secret": true, + "order": 6, + "type": "string" + }, "aws_secret_access_key": { "title": "AWS Secret Access Key", "description": "In order to access private Buckets stored on AWS S3, this connector requires credentials with the proper permissions. 
If accessing publicly available data, this field is not necessary.", diff --git a/airbyte-integrations/connectors/source-s3/integration_tests/spec.json b/airbyte-integrations/connectors/source-s3/integration_tests/spec.json index dd3309240ada..e54314c58594 100644 --- a/airbyte-integrations/connectors/source-s3/integration_tests/spec.json +++ b/airbyte-integrations/connectors/source-s3/integration_tests/spec.json @@ -365,6 +365,13 @@ "order": 3, "type": "string" }, + "role_arn": { + "title": "AWS Role ARN", + "description": "Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations requested using this profile. Set External ID as 'None'.", + "airbyte_secret": true, + "order": 6, + "type": "string" + }, "endpoint": { "title": "Endpoint", "description": "Endpoint to an S3 compatible service. Leave empty to use AWS.", diff --git a/airbyte-integrations/connectors/source-s3/metadata.yaml b/airbyte-integrations/connectors/source-s3/metadata.yaml index f3bed3cd1ff4..d660125e526b 100644 --- a/airbyte-integrations/connectors/source-s3/metadata.yaml +++ b/airbyte-integrations/connectors/source-s3/metadata.yaml @@ -10,7 +10,7 @@ data: connectorSubtype: file connectorType: source definitionId: 69589781-7828-43c5-9f63-8925b1c1ccc2 - dockerImageTag: 4.3.0 + dockerImageTag: 4.4.0 dockerRepository: airbyte/source-s3 documentationUrl: https://docs.airbyte.com/integrations/sources/s3 githubIssueLabel: source-s3 diff --git a/airbyte-integrations/connectors/source-s3/setup.py b/airbyte-integrations/connectors/source-s3/setup.py index 3171ff163f0e..7343e25775a4 100644 --- a/airbyte-integrations/connectors/source-s3/setup.py +++ b/airbyte-integrations/connectors/source-s3/setup.py @@ -14,13 +14,7 @@ "python-snappy==0.6.1", ] -TEST_REQUIREMENTS = [ - "requests-mock~=1.9.3", - "pytest-mock~=3.6.1", - "pytest~=6.1", - "pandas==2.0.3", - "docker", -] +TEST_REQUIREMENTS = ["requests-mock~=1.9.3", "pytest-mock~=3.6.1", "pytest~=6.1", "pandas==2.0.3", 
"docker", "moto"] setup( name="source_s3", diff --git a/airbyte-integrations/connectors/source-s3/source_s3/source.py b/airbyte-integrations/connectors/source-s3/source_s3/source.py index 99ac3710b337..8e9cb035b713 100644 --- a/airbyte-integrations/connectors/source-s3/source_s3/source.py +++ b/airbyte-integrations/connectors/source-s3/source_s3/source.py @@ -43,7 +43,7 @@ class Config: order=2, ) role_arn: Optional[str] = Field( - title="AWS Role ARN", + title=f"AWS Role ARN (External ID is '{AWS_EXTERNAL_ID}')" if AWS_EXTERNAL_ID else "AWS Role ARN", default=None, description="Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations " f"requested using this profile. Set External ID as '{AWS_EXTERNAL_ID}'.", diff --git a/airbyte-integrations/connectors/source-s3/source_s3/v4/config.py b/airbyte-integrations/connectors/source-s3/source_s3/v4/config.py index 40c526f3f016..b96017de003a 100644 --- a/airbyte-integrations/connectors/source-s3/source_s3/v4/config.py +++ b/airbyte-integrations/connectors/source-s3/source_s3/v4/config.py @@ -35,10 +35,10 @@ def documentation_url(cls) -> AnyUrl: ) role_arn: Optional[str] = Field( - title="AWS Role ARN", + title=f"AWS Role ARN (External ID is '{AWS_EXTERNAL_ID}')" if AWS_EXTERNAL_ID else "AWS Role ARN", default=None, description="Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations " - f"requested using this profile. Set External ID as {AWS_EXTERNAL_ID}.", + f"requested using this profile. 
Set External ID as '{AWS_EXTERNAL_ID}'.", airbyte_secret=True, order=6, ) diff --git a/airbyte-integrations/connectors/source-s3/source_s3/v4/stream_reader.py b/airbyte-integrations/connectors/source-s3/source_s3/v4/stream_reader.py index 60f9a49de3ba..d544818c2e5a 100644 --- a/airbyte-integrations/connectors/source-s3/source_s3/v4/stream_reader.py +++ b/airbyte-integrations/connectors/source-s3/source_s3/v4/stream_reader.py @@ -60,32 +60,7 @@ def s3_client(self) -> BaseClient: client_kv_args = _get_s3_compatible_client_args(self.config) if self.config.endpoint else {} if self.config.role_arn: - - def refresh(): - client = boto3.client("sts") - role = client.assume_role( - RoleArn=self.config.role_arn, - RoleSessionName="airbyte-source-s3", - ExternalId=AWS_EXTERNAL_ID, - ) - creds = role.get("Credentials", {}) - return { - "access_key": creds["AccessKeyId"], - "secret_key": creds["SecretAccessKey"], - "token": creds["SessionToken"], - "expiry_time": creds["Expiration"].isoformat(), - } - - session_credentials = RefreshableCredentials.create_from_metadata( - metadata=refresh(), - refresh_using=refresh, - method="sts-assume-role", - ) - - session = get_session() - session._credentials = session_credentials - autorefresh_session = boto3.Session(botocore_session=session) - self._s3_client = autorefresh_session.client("s3", **client_kv_args) + self._s3_client = self._get_iam_s3_client(client_kv_args) else: self._s3_client = boto3.client( "s3", @@ -96,6 +71,41 @@ def refresh(): return self._s3_client + def _get_iam_s3_client(self, client_kv_args: dict) -> BaseClient: + def refresh(): + client = boto3.client("sts") + if AWS_EXTERNAL_ID: + role = client.assume_role( + RoleArn=self.config.role_arn, + RoleSessionName="airbyte-source-s3", + ExternalId=AWS_EXTERNAL_ID, + ) + else: + role = client.assume_role( + RoleArn=self.config.role_arn, + RoleSessionName="airbyte-source-s3", + ) + + creds = role.get("Credentials", {}) + return { + "access_key": creds["AccessKeyId"], + 
"secret_key": creds["SecretAccessKey"], + "token": creds["SessionToken"], + "expiry_time": creds["Expiration"].isoformat(), + } + + session_credentials = RefreshableCredentials.create_from_metadata( + metadata=refresh(), + refresh_using=refresh, + method="sts-assume-role", + ) + + session = get_session() + session._credentials = session_credentials + autorefresh_session = boto3.Session(botocore_session=session) + + return autorefresh_session.client("s3", **client_kv_args) + def get_matching_files(self, globs: List[str], prefix: Optional[str], logger: logging.Logger) -> Iterable[RemoteFile]: """ Get all files matching the specified glob patterns. diff --git a/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_stream_reader.py b/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_stream_reader.py index 05d7f7873be1..b1bede862d22 100644 --- a/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_stream_reader.py +++ b/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_stream_reader.py @@ -16,6 +16,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import FileReadMode from airbyte_cdk.sources.file_based.remote_file import RemoteFile from botocore.stub import Stubber +from moto import mock_sts from pydantic import AnyUrl from source_s3.v4.config import Config from source_s3.v4.stream_reader import SourceS3StreamReader @@ -238,3 +239,33 @@ def set_stub(reader: SourceS3StreamReader, contents: List[Dict[str, Any]], multi ) s3_stub.activate() return s3_stub + + +@mock_sts +@patch("source_s3.v4.stream_reader.boto3.client") +def test_get_iam_s3_client(boto3_client_mock): + # Mock the STS client assume_role method + boto3_client_mock.return_value.assume_role.return_value = { + "Credentials": { + "AccessKeyId": "assumed_access_key_id", + "SecretAccessKey": "assumed_secret_access_key", + "SessionToken": "assumed_session_token", + "Expiration": datetime.now(), + } + } + + # Instantiate your stream reader and set the config + 
reader = SourceS3StreamReader() + reader.config = Config( + bucket="test", + role_arn="arn:aws:iam::123456789012:role/my-role", + streams=[], + endpoint=None, + ) + + # Call _get_iam_s3_client + with Stubber(reader.s3_client): + s3_client = reader._get_iam_s3_client({}) + + # Assertions to validate the s3 client + assert s3_client is not None diff --git a/docs/integrations/sources/s3.md b/docs/integrations/sources/s3.md index 533d25b29285..f11156d77604 100644 --- a/docs/integrations/sources/s3.md +++ b/docs/integrations/sources/s3.md @@ -15,7 +15,9 @@ Please note that using cloud storage may incur egress costs. Egress refers to da ### Step 1: Set up Amazon S3 -**If you are syncing from a private bucket**, you will need to provide both an `AWS Access Key ID` and `AWS Secret Access Key` to authenticate the connection. The IAM user associated with the credentials must be granted `read` and `list` permissions for the bucket and its objects. If you are unfamiliar with configuring AWS permissions, you can follow these steps to obtain the necessary permissions and credentials: +**If you are syncing from a private bucket**, you need to authenticate the connection. This can be done either by using an `IAM User` (with `AWS Access Key ID` and `Secret Access Key`) or an `IAM Role` (with `Role ARN`). Begin by creating a policy with the necessary permissions: + +#### Create a Policy 1. Log in to your Amazon AWS account and open the [IAM console](https://console.aws.amazon.com/iam/home#home). 2. In the IAM dashboard, select **Policies**, then click **Create Policy**. @@ -45,10 +47,63 @@ At this time, object-level permissions alone are not sufficient to successfully ::: 4. Give your policy a descriptive name, then click **Create policy**. -5. In the IAM dashboard, click **Users**. Select an existing IAM user or create a new one by clicking **Add users**. -6. If you are using an _existing_ IAM user, click the **Add permissions** dropdown menu and select **Add permissions**. 
If you are creating a _new_ user, you will be taken to the Permissions screen after selecting a name. -7. Select **Attach policies directly**, then find and check the box for your new policy. Click **Next**, then **Add permissions**. -8. After successfully creating your user, select the **Security credentials** tab and click **Create access key**. You will be prompted to select a use case and add optional tags to your access key. Click **Create access key** to generate the keys. + +#### Option 1: Using an IAM Role (Most secure) + +1. In the IAM dashboard, click **Roles**, then **Create role**. +2. Choose the appropriate trust entity and attach the policy you created. +3. Set up a trust relationship for the role. For example, for the **AWS account** trusted entity type, use the default AWS account on your instance (it will be used to assume the role). To use an **External ID**, set it as an environment variable: `export AWS_EXTERNAL_ID="{your-external-id}"`. Edit the trust relationship policy to reflect this: +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam::{your-aws-account-id}:user/{your-username}" + }, + "Action": "sts:AssumeRole", + "Condition": { + "StringEquals": { + "sts:ExternalId": "{your-external-id}" + } + } + } + ] +} +``` + + +2. Choose the **AWS account** trusted entity type. +3. Set up a trust relationship for the role. This allows the Airbyte instance's AWS account to assume this role. You will also need to specify an external ID, which is a secret key that the trusting service (Airbyte) and the trusted role (the role you're creating) both know. This ID is used to prevent the "confused deputy" problem. You can find the External ID in the title or description of the **AWS Role ARN** property. 
Edit the trust relationship policy to include the external ID: +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam::{airbyte-aws-account-id}:user/{your-username}" + }, + "Action": "sts:AssumeRole", + "Condition": { + "StringEquals": { + "sts:ExternalId": "{your-external-id}" + } + } + } + ] +} +``` + +4. Complete the role creation and note the Role ARN. + +#### Option 2: Using an IAM User + +1. In the IAM dashboard, click **Users**. Select an existing IAM user or create a new one by clicking **Add users**. +2. If you are using an _existing_ IAM user, click the **Add permissions** dropdown menu and select **Add permissions**. If you are creating a _new_ user, you will be taken to the Permissions screen after selecting a name. +3. Select **Attach policies directly**, then find and check the box for your new policy. Click **Next**, then **Add permissions**. +4. After successfully creating your user, select the **Security credentials** tab and click **Create access key**. You will be prompted to select a use case and add optional tags to your access key. Click **Create access key** to generate the keys. :::caution Your `Secret Access Key` will only be visible once upon creation. Be sure to copy and store it securely for future use. @@ -69,7 +124,11 @@ For more information on managing your access keys, please refer to the 3. Give a **Name** to the stream 4. (Optional) - If you want to enforce a specific schema, you can enter a **Input schema**. By default, this value is set to `{}` and will automatically infer the schema from the file\(s\) you are replicating. For details on providing a custom schema, refer to the [User Schema section](#user-schema). 5. Optionally, enter the **Globs** which dictates which files to be synced. This is a regular expression that allows Airbyte to pattern match the specific files to replicate. If you are replicating all the files within your bucket, use `**` as the pattern. 
For more precise pattern matching options, refer to the [Path Patterns section](#path-patterns) below. -6. **If you are syncing from a private bucket**, you must fill the **AWS Access Key ID** and **AWS Secret Access Key** fields with the appropriate credentials to authenticate the connection. All other fields are optional and can be left empty. Refer to the [S3 Provider Settings section](#s3-provider-settings) below for more information on each field. +6. **To authenticate your private bucket**: + - If using an IAM role, enter the **AWS Role ARN**. + - If using IAM user credentials, fill the **AWS Access Key ID** and **AWS Secret Access Key** fields with the appropriate credentials. + +All other fields are optional and can be left empty. Refer to the [S3 Provider Settings section](#s3-provider-settings) below for more information on each field. ## Supported sync modes @@ -256,7 +315,8 @@ To perform the text extraction from PDF and Docx files, the connector uses the [ | Version | Date | Pull Request | Subject | |:--------|:-----------|:----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------| -| 4.3.0 | 2023-12-14 | [33411](https://github.com/airbytehq/airbyte/pull/33411) | Bump CDK version to auto-set primary key for document file streams and support raw txt files | +| 4.4.0 | 2023-12-28 | [33818](https://github.com/airbytehq/airbyte/pull/33818) | Add IAM Role Authentication | +| 4.3.0 | 2023-12-14 | [33411](https://github.com/airbytehq/airbyte/pull/33411) | Bump CDK version to auto-set primary key for document file streams and support raw txt files | | 4.2.4 | 2023-12-06 | [33187](https://github.com/airbytehq/airbyte/pull/33187) | Bump CDK version to hide source-defined primary key | | 4.2.3 | 2023-11-16 | [32608](https://github.com/airbytehq/airbyte/pull/32608) | Improve document file type 
parser | | 4.2.2 | 2023-11-20 | [32677](https://github.com/airbytehq/airbyte/pull/32677) | Only read files with ".zip" extension as zipped files |