Skip to content

Commit

Permalink
Update version and add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
tolik0 committed Dec 28, 2023
1 parent b8095e9 commit 2332bb4
Show file tree
Hide file tree
Showing 9 changed files with 153 additions and 44 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,13 @@
"order": 2,
"type": "string"
},
"role_arn": {
"title": "AWS Role ARN",
"description": "Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations requested using this profile. Set External ID as 'None'.",
"airbyte_secret": true,
"order": 6,
"type": "string"
},
"aws_secret_access_key": {
"title": "AWS Secret Access Key",
"description": "In order to access private Buckets stored on AWS S3, this connector requires credentials with the proper permissions. If accessing publicly available data, this field is not necessary.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,13 @@
"order": 3,
"type": "string"
},
"role_arn": {
"title": "AWS Role ARN",
"description": "Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations requested using this profile. Set External ID as 'None'.",
"airbyte_secret": true,
"order": 6,
"type": "string"
},
"endpoint": {
"title": "Endpoint",
"description": "Endpoint to an S3 compatible service. Leave empty to use AWS.",
Expand Down
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-s3/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ data:
connectorSubtype: file
connectorType: source
definitionId: 69589781-7828-43c5-9f63-8925b1c1ccc2
dockerImageTag: 4.3.0
dockerImageTag: 4.4.0
dockerRepository: airbyte/source-s3
documentationUrl: https://docs.airbyte.com/integrations/sources/s3
githubIssueLabel: source-s3
Expand Down
8 changes: 1 addition & 7 deletions airbyte-integrations/connectors/source-s3/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,7 @@
"python-snappy==0.6.1",
]

TEST_REQUIREMENTS = [
"requests-mock~=1.9.3",
"pytest-mock~=3.6.1",
"pytest~=6.1",
"pandas==2.0.3",
"docker",
]
TEST_REQUIREMENTS = ["requests-mock~=1.9.3", "pytest-mock~=3.6.1", "pytest~=6.1", "pandas==2.0.3", "docker", "moto"]

setup(
name="source_s3",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class Config:
order=2,
)
role_arn: Optional[str] = Field(
title="AWS Role ARN",
title=f"AWS Role ARN (External ID is '{AWS_EXTERNAL_ID}')" if AWS_EXTERNAL_ID else "AWS Role ARN",
default=None,
description="Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations "
f"requested using this profile. Set External ID as '{AWS_EXTERNAL_ID}'.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ def documentation_url(cls) -> AnyUrl:
)

role_arn: Optional[str] = Field(
title="AWS Role ARN",
title=f"AWS Role ARN (External ID is '{AWS_EXTERNAL_ID}')" if AWS_EXTERNAL_ID else "AWS Role ARN",
default=None,
description="Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations "
f"requested using this profile. Set External ID as {AWS_EXTERNAL_ID}.",
f"requested using this profile. Set External ID as '{AWS_EXTERNAL_ID}'.",
airbyte_secret=True,
order=6,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,32 +60,7 @@ def s3_client(self) -> BaseClient:
client_kv_args = _get_s3_compatible_client_args(self.config) if self.config.endpoint else {}

if self.config.role_arn:

def refresh():
client = boto3.client("sts")
role = client.assume_role(
RoleArn=self.config.role_arn,
RoleSessionName="airbyte-source-s3",
ExternalId=AWS_EXTERNAL_ID,
)
creds = role.get("Credentials", {})
return {
"access_key": creds["AccessKeyId"],
"secret_key": creds["SecretAccessKey"],
"token": creds["SessionToken"],
"expiry_time": creds["Expiration"].isoformat(),
}

session_credentials = RefreshableCredentials.create_from_metadata(
metadata=refresh(),
refresh_using=refresh,
method="sts-assume-role",
)

session = get_session()
session._credentials = session_credentials
autorefresh_session = boto3.Session(botocore_session=session)
self._s3_client = autorefresh_session.client("s3", **client_kv_args)
self._s3_client = self._get_iam_s3_client(client_kv_args)
else:
self._s3_client = boto3.client(
"s3",
Expand All @@ -96,6 +71,41 @@ def refresh():

return self._s3_client

def _get_iam_s3_client(self, client_kv_args: dict) -> BaseClient:
    """
    Build an S3 client authenticated by assuming the IAM role from the config.

    The role given by `self.config.role_arn` is assumed through STS. The resulting
    temporary credentials are wrapped in `RefreshableCredentials`, which also receives
    the same fetch function as its `refresh_using` callback.

    :param client_kv_args: extra keyword arguments forwarded to the S3 client constructor
    :return: an S3 client created from a session that carries the assumed-role credentials
    """

    def fetch_role_credentials() -> dict:
        sts_client = boto3.client("sts")

        assume_role_kwargs = {
            "RoleArn": self.config.role_arn,
            "RoleSessionName": "airbyte-source-s3",
        }
        # The external ID is only sent when one is configured.
        if AWS_EXTERNAL_ID:
            assume_role_kwargs["ExternalId"] = AWS_EXTERNAL_ID

        response = sts_client.assume_role(**assume_role_kwargs)
        credentials = response.get("Credentials", {})

        # Key names follow the metadata layout passed to RefreshableCredentials below.
        return {
            "access_key": credentials["AccessKeyId"],
            "secret_key": credentials["SecretAccessKey"],
            "token": credentials["SessionToken"],
            "expiry_time": credentials["Expiration"].isoformat(),
        }

    refreshable_credentials = RefreshableCredentials.create_from_metadata(
        metadata=fetch_role_credentials(),
        refresh_using=fetch_role_credentials,
        method="sts-assume-role",
    )

    # Attach the refreshable credentials to a botocore session and wrap it in a boto3 session.
    botocore_session = get_session()
    botocore_session._credentials = refreshable_credentials

    return boto3.Session(botocore_session=botocore_session).client("s3", **client_kv_args)

def get_matching_files(self, globs: List[str], prefix: Optional[str], logger: logging.Logger) -> Iterable[RemoteFile]:
"""
Get all files matching the specified glob patterns.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from airbyte_cdk.sources.file_based.file_based_stream_reader import FileReadMode
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
from botocore.stub import Stubber
from moto import mock_sts
from pydantic import AnyUrl
from source_s3.v4.config import Config
from source_s3.v4.stream_reader import SourceS3StreamReader
Expand Down Expand Up @@ -238,3 +239,33 @@ def set_stub(reader: SourceS3StreamReader, contents: List[Dict[str, Any]], multi
)
s3_stub.activate()
return s3_stub


@mock_sts
@patch("source_s3.v4.stream_reader.boto3.client")
def test_get_iam_s3_client(boto3_client_mock):
    """Check that the IAM-role code path assumes the configured role and returns an S3 client."""
    # Imported locally so this test does not depend on the module's top-level datetime imports.
    from datetime import timedelta, timezone

    role_arn = "arn:aws:iam::123456789012:role/my-role"

    # Mock the STS client assume_role method. The expiration is timezone-aware and in the
    # future so the mocked credentials look like a real STS response and are not already expired.
    boto3_client_mock.return_value.assume_role.return_value = {
        "Credentials": {
            "AccessKeyId": "assumed_access_key_id",
            "SecretAccessKey": "assumed_secret_access_key",
            "SessionToken": "assumed_session_token",
            "Expiration": datetime.now(timezone.utc) + timedelta(hours=1),
        }
    }

    # Instantiate the stream reader and set a config that selects role-based authentication
    reader = SourceS3StreamReader()
    reader.config = Config(
        bucket="test",
        role_arn=role_arn,
        streams=[],
        endpoint=None,
    )

    # Call _get_iam_s3_client
    with Stubber(reader.s3_client):
        s3_client = reader._get_iam_s3_client({})

    # The returned object is an S3 client
    assert s3_client is not None
    assert s3_client.meta.service_model.service_name == "s3"

    # The role from the config was assumed through STS
    boto3_client_mock.assert_any_call("sts")
    assume_role_kwargs = boto3_client_mock.return_value.assume_role.call_args[1]
    assert assume_role_kwargs["RoleArn"] == role_arn
    assert assume_role_kwargs["RoleSessionName"] == "airbyte-source-s3"
74 changes: 67 additions & 7 deletions docs/integrations/sources/s3.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ Please note that using cloud storage may incur egress costs. Egress refers to da

### Step 1: Set up Amazon S3

**If you are syncing from a private bucket**, you will need to provide both an `AWS Access Key ID` and `AWS Secret Access Key` to authenticate the connection. The IAM user associated with the credentials must be granted `read` and `list` permissions for the bucket and its objects. If you are unfamiliar with configuring AWS permissions, you can follow these steps to obtain the necessary permissions and credentials:
**If you are syncing from a private bucket**, you need to authenticate the connection. This can be done either by using an `IAM User` (with `AWS Access Key ID` and `Secret Access Key`) or an `IAM Role` (with `Role ARN`). Begin by creating a policy with the necessary permissions:

#### Create a Policy

1. Log in to your Amazon AWS account and open the [IAM console](https://console.aws.amazon.com/iam/home#home).
2. In the IAM dashboard, select **Policies**, then click **Create Policy**.
Expand Down Expand Up @@ -45,10 +47,63 @@ At this time, object-level permissions alone are not sufficient to successfully
:::

4. Give your policy a descriptive name, then click **Create policy**.
5. In the IAM dashboard, click **Users**. Select an existing IAM user or create a new one by clicking **Add users**.
6. If you are using an _existing_ IAM user, click the **Add permissions** dropdown menu and select **Add permissions**. If you are creating a _new_ user, you will be taken to the Permissions screen after selecting a name.
7. Select **Attach policies directly**, then find and check the box for your new policy. Click **Next**, then **Add permissions**.
8. After successfully creating your user, select the **Security credentials** tab and click **Create access key**. You will be prompted to select a use case and add optional tags to your access key. Click **Create access key** to generate the keys.

#### Option 1: Using an IAM Role (Most secure)

1. In the IAM dashboard, click **Roles**, then **Create role**. <!-- env:oss -->
2. Choose the appropriate trust entity and attach the policy you created.
3. Set up a trust relationship for the role. For example, for the **AWS account** trusted entity type, use the default AWS account configured on your instance (it will be used to assume the role). To use an **External ID**, set it as an environment variable: `export AWS_EXTERNAL_ID="{your-external-id}"`. Edit the trust relationship policy to reflect this:
```
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"AWS": "arn:aws:iam::{your-aws-account-id}:user/{your-username}"
},
"Action": "sts:AssumeRole",
"Condition": {
"StringEquals": {
"sts:ExternalId": "{your-external-id}"
}
}
}
]
}
```
<!-- /env:oss -->
<!-- env:cloud -->
2. Choose the **AWS account** trusted entity type.
3. Set up a trust relationship for the role. This allows the Airbyte instance's AWS account to assume this role. You will also need to specify an external ID, which is a secret key that the trusting service (Airbyte) and the trusted role (the role you're creating) both know. This ID is used to prevent the "confused deputy" problem. You can find the External ID in the title or description of the **AWS Role ARN** property. Edit the trust relationship policy to include the external ID:
```
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"AWS": "arn:aws:iam::{airbyte-aws-account-id}:user/{your-username}"
},
"Action": "sts:AssumeRole",
"Condition": {
"StringEquals": {
"sts:ExternalId": "{your-external-id}"
}
}
}
]
}
```
<!-- /env:cloud -->
4. Complete the role creation and note the Role ARN.

#### Option 2: Using an IAM User

1. In the IAM dashboard, click **Users**. Select an existing IAM user or create a new one by clicking **Add users**.
2. If you are using an _existing_ IAM user, click the **Add permissions** dropdown menu and select **Add permissions**. If you are creating a _new_ user, you will be taken to the Permissions screen after selecting a name.
3. Select **Attach policies directly**, then find and check the box for your new policy. Click **Next**, then **Add permissions**.
4. After successfully creating your user, select the **Security credentials** tab and click **Create access key**. You will be prompted to select a use case and add optional tags to your access key. Click **Create access key** to generate the keys.

:::caution
Your `Secret Access Key` will only be visible once upon creation. Be sure to copy and store it securely for future use.
Expand All @@ -69,7 +124,11 @@ For more information on managing your access keys, please refer to the
3. Give a **Name** to the stream
4. (Optional) - If you want to enforce a specific schema, you can enter an **Input schema**. By default, this value is set to `{}` and will automatically infer the schema from the file\(s\) you are replicating. For details on providing a custom schema, refer to the [User Schema section](#user-schema).
5. Optionally, enter the **Globs**, which dictate which files are synced. These are glob patterns that allow Airbyte to match the specific files to replicate. If you are replicating all the files within your bucket, use `**` as the pattern. For more precise pattern matching options, refer to the [Path Patterns section](#path-patterns) below.
6. **If you are syncing from a private bucket**, you must fill the **AWS Access Key ID** and **AWS Secret Access Key** fields with the appropriate credentials to authenticate the connection. All other fields are optional and can be left empty. Refer to the [S3 Provider Settings section](#s3-provider-settings) below for more information on each field.
6. **To authenticate your private bucket**:
- If using an IAM role, enter the **AWS Role ARN**.
- If using IAM user credentials, fill the **AWS Access Key ID** and **AWS Secret Access Key** fields with the appropriate credentials.

All other fields are optional and can be left empty. Refer to the [S3 Provider Settings section](#s3-provider-settings) below for more information on each field.

## Supported sync modes

Expand Down Expand Up @@ -256,7 +315,8 @@ To perform the text extraction from PDF and Docx files, the connector uses the [

| Version | Date | Pull Request | Subject |
|:--------|:-----------|:----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------|
| 4.3.0 | 2023-12-14 | [33411](https://github.com/airbytehq/airbyte/pull/33411) | Bump CDK version to auto-set primary key for document file streams and support raw txt files |
| 4.4.0 | 2023-12-28 | [33818](https://github.com/airbytehq/airbyte/pull/33818) | Add IAM Role Authentication |
| 4.3.0 | 2023-12-14 | [33411](https://github.com/airbytehq/airbyte/pull/33411) | Bump CDK version to auto-set primary key for document file streams and support raw txt files |
| 4.2.4 | 2023-12-06 | [33187](https://github.com/airbytehq/airbyte/pull/33187) | Bump CDK version to hide source-defined primary key |
| 4.2.3 | 2023-11-16 | [32608](https://github.com/airbytehq/airbyte/pull/32608) | Improve document file type parser |
| 4.2.2 | 2023-11-20 | [32677](https://github.com/airbytehq/airbyte/pull/32677) | Only read files with ".zip" extension as zipped files |
Expand Down

0 comments on commit 2332bb4

Please sign in to comment.