diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 3746fd1..fe17278 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -8,7 +8,11 @@
     "ghcr.io/devcontainers/features/python:1.6.3": {
       "version": "3.12.0"
     },
-    "ghcr.io/devcontainers/features/aws-cli:1": {}
+    "ghcr.io/devcontainers/features/aws-cli:1": {},
+    "ghcr.io/devcontainers/features/docker-in-docker:2.12.0": {
+      "version": "27.0.3",
+      "moby": false
+    }
   },
   "postCreateCommand": "./tools/setup.sh",
   "shutdownAction": "stopContainer"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 14cfab8..d843665 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,3 +1,6 @@
+ci:
+  autoupdate_schedule: monthly
+
 default_language_version:
   python: python3
 
@@ -17,7 +20,7 @@ repos:
     hooks:
       - id: yamllint
   - repo: https://github.com/awslabs/cfn-python-lint
-    rev: v1.18.4
+    rev: v1.20.1
     hooks:
       - id: cfn-python-lint
        args:
@@ -36,7 +39,7 @@
     hooks:
       - id: black
   - repo: https://github.com/sirosen/check-jsonschema
-    rev: 0.29.4
+    rev: 0.30.0
     hooks:
       - id: check-github-workflows
       - id: check-github-actions
diff --git a/README.md b/README.md
index fce5714..872fb23 100644
--- a/README.md
+++ b/README.md
@@ -38,12 +38,17 @@
 also include a Python virtual environment where all the Python packages needed
 are already installed.
 
 If you decide to develop outside of the dev container, some of the development
-tools can be installed by running:
+tools can be installed manually by running:
 
 ```console
 ./tools/setup.sh
 ```
 
+When developing outside the dev container, the following tools must be installed
+manually:
+
+- [Docker](https://docs.docker.com/engine/install/) >= v27
+
 Development requires the activation of the Python virtual environment:
 ```
diff --git a/app.py b/app.py
index fcaa4cc..66118de 100644
--- a/app.py
+++ b/app.py
@@ -1,4 +1,5 @@
 import aws_cdk as cdk
+from aws_cdk.aws_scheduler_alpha import ScheduleExpression
 
 from openchallenges.bucket_stack import BucketStack
 from openchallenges.network_stack import NetworkStack
@@ -6,7 +7,9 @@
 from openchallenges.service_stack import ServiceStack
 from openchallenges.service_stack import LoadBalancedServiceStack
 from openchallenges.load_balancer_stack import LoadBalancerStack
-from openchallenges.service_props import ServiceProps
+from openchallenges.service_props import ServiceProps, ContainerVolume
+from openchallenges.data_integration_stack import DataIntegrationStack
+from openchallenges.data_integration_props import DataIntegrationProps
 import openchallenges.utils as utils
 
 app = cdk.App()
@@ -14,7 +17,7 @@
 # get the environment
 environment = utils.get_environment()
 stack_name_prefix = f"openchallenges-{environment}"
-image_version = "0.0.12"
+image_version = "1.1.1"
 
 # get VARS from cdk.json
 env_vars = app.node.try_get_context(environment)
@@ -45,6 +48,12 @@
         "MARIADB_PASSWORD": secrets["MARIADB_PASSWORD"],
         "MARIADB_ROOT_PASSWORD": secrets["MARIADB_ROOT_PASSWORD"],
     },
+    container_volumes=[
+        ContainerVolume(
+            path="/data/db",
+            size=30,
+        )
+    ],
 )
 
 mariadb_stack = ServiceStack(
@@ -297,9 +306,9 @@
     f"ghcr.io/sage-bionetworks/openchallenges-app:{image_version}",
     {
         "API_DOCS_URL": f"https://{fully_qualified_domain_name}/api-docs",
-        "APP_VERSION": "1.0.12-beta",
+        "APP_VERSION": image_version,
         "CSR_API_URL": f"https://{fully_qualified_domain_name}/api/v1",
-        "DATA_UPDATED_ON": "2024-11-13",
+        "DATA_UPDATED_ON": "2024-12-10",
         "ENVIRONMENT": "production",
         "GOOGLE_TAG_MANAGER_ID": "GTM-NBR5XD8C",
         "SSR_API_URL": "http://openchallenges-api-gateway:8082/api/v1",
@@ -322,6 +331,20 @@
     app, f"{stack_name_prefix}-load-balancer", network_stack.vpc
 )
 
+data_integration_props = DataIntegrationProps(
+    schedule=ScheduleExpression.cron(
+        minute="*/5",
+        hour="*",
+        day="*",
+        month="*",
+        time_zone=cdk.TimeZone.AMERICA_LOS_ANGELES,
+    ),
+    schedule_description="Cron-based schedule that runs every 5 minutes",
+)
+data_integration_stack = DataIntegrationStack(
+    app, f"{stack_name_prefix}-data-integration", data_integration_props
+)
+
 api_docs_props = ServiceProps(
     "openchallenges-api-docs",
     8010,
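Note on the schedule added to `app.py`: `ScheduleExpression.cron` builds an EventBridge Scheduler cron expression, and `minute="*/5"` with wildcards in the other fields fires every five minutes in the given time zone. As a hedged sketch using the same API (the nightly values below are illustrative and not part of this change):

```python
# Illustrative sketch only (not in this diff): a nightly 02:00 schedule built
# with the same aws-cdk.aws-scheduler-alpha API used above.
import aws_cdk as cdk
from aws_cdk.aws_scheduler_alpha import ScheduleExpression

nightly_schedule = ScheduleExpression.cron(
    minute="0",
    hour="2",
    day="*",
    month="*",
    time_zone=cdk.TimeZone.AMERICA_LOS_ANGELES,  # evaluated in Pacific time
)
```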
"http://openchallenges-api-gateway:8082/api/v1", @@ -322,6 +331,20 @@ app, f"{stack_name_prefix}-load-balancer", network_stack.vpc ) +data_integration_props = DataIntegrationProps( + schedule=ScheduleExpression.cron( + minute="*/5", + hour="*", + day="*", + month="*", + time_zone=cdk.TimeZone.AMERICA_LOS_ANGELES, + ), + schedule_description="This is a cron-based schedule that will run every 5 minutes", +) +data_integration_stack = DataIntegrationStack( + app, f"{stack_name_prefix}-data-integration", data_integration_props +) + api_docs_props = ServiceProps( "openchallenges-api-docs", 8010, diff --git a/cdk_docker/data-integration-lambda/Dockerfile b/cdk_docker/data-integration-lambda/Dockerfile new file mode 100644 index 0000000..6665871 --- /dev/null +++ b/cdk_docker/data-integration-lambda/Dockerfile @@ -0,0 +1 @@ +FROM ghcr.io/sage-bionetworks/sandbox-lambda-python:sha-b38dc22 diff --git a/openchallenges/data_integration_lambda.py b/openchallenges/data_integration_lambda.py new file mode 100644 index 0000000..5a03836 --- /dev/null +++ b/openchallenges/data_integration_lambda.py @@ -0,0 +1,62 @@ +from aws_cdk import aws_iam as iam +from aws_cdk import aws_lambda as lambda_ +from constructs import Construct + + +class DataIntegrationLambda(Construct): + """ + A CDK construct to define an AWS Lambda function for data integration. + + This construct creates an IAM role with the necessary permissions and a Docker-based + Lambda function for handling data integration tasks. + """ + + def __init__(self, scope: Construct, id: str) -> None: + """ + Builds the IAM role for the Lambda function. + + This role allows the Lambda function to execute basic AWS operations. + + Returns: + iam.Role: The IAM role for the Lambda function. + """ + super().__init__(scope, id) + + self.lambda_role = self._build_lambda_role() + self.lambda_function = self._build_lambda_function(self.lambda_role) + + def _build_lambda_role(self) -> iam.Role: + return iam.Role( + self, + "LambdaRole", + assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"), + managed_policies=[ + iam.ManagedPolicy.from_aws_managed_policy_name( + managed_policy_name=("service-role/AWSLambdaBasicExecutionRole") + ) + ], + ) + + def _build_lambda_function(self, role: iam.Role) -> lambda_.Function: + """ + Builds the Docker-based AWS Lambda function. + + The Lambda function uses a Docker image built from a local directory. + + Args: + role (iam.Role): The IAM role to associate with the Lambda function. + + Returns: + _lambda.Function: The Docker-based AWS Lambda function. + """ + return lambda_.DockerImageFunction( + self, + "LambdaFunction", + code=lambda_.DockerImageCode.from_image_asset( + # Directory relative to where you execute cdk deploy contains a + # Dockerfile with build instructions. + directory="cdk_docker/data-integration-lambda" + ), + role=role, + memory_size=128, + ) diff --git a/openchallenges/data_integration_props.py b/openchallenges/data_integration_props.py new file mode 100644 index 0000000..59416e4 --- /dev/null +++ b/openchallenges/data_integration_props.py @@ -0,0 +1,19 @@ +from dataclasses import dataclass +from aws_cdk.aws_scheduler_alpha import ScheduleExpression + + +@dataclass +class DataIntegrationProps: + """ + Data integration properties. + + Attributes: + schedule (ScheduleExpression): The schedule for triggering the data integration. + schedule_description (str): The description of the schedule. 
+ """ + + schedule: ScheduleExpression + """The schedule for triggering the data integration.""" + + schedule_description: str + """The description of the schedule.""" diff --git a/openchallenges/data_integration_stack.py b/openchallenges/data_integration_stack.py new file mode 100644 index 0000000..5b87a76 --- /dev/null +++ b/openchallenges/data_integration_stack.py @@ -0,0 +1,66 @@ +import aws_cdk as cdk +from aws_cdk import ( + aws_scheduler_alpha as scheduler_alpha, + aws_scheduler_targets_alpha as scheduler_targets, +) +from openchallenges.data_integration_lambda import DataIntegrationLambda +from openchallenges.data_integration_props import DataIntegrationProps +from constructs import Construct + + +class DataIntegrationStack(cdk.Stack): + """ + Defines an AWS CDK stack for data integration. + + This stack sets up the resources required for scheduling and executing + data integration tasks using AWS Lambda and EventBridge Scheduler. + + The stack includes: + - A Lambda function for data integration. + - An EventBridge Scheduler schedule to trigger the Lambda function. + - An EventBridge Scheduler group for organizing schedules. + + Attributes: + scope (Construct): The parent construct. + id (str): The unique identifier for this stack. + props (DataIntegrationProps): The properties for the data integration, including the schedule. + """ + + def __init__( + self, scope: Construct, id: str, props: DataIntegrationProps, **kwargs + ) -> None: + """ + Initializes the DataIntegrationStack. + + Arguments: + scope (Construct): The parent construct for this stack. + id (str): The unique identifier for this stack. + props (DataIntegrationProps): The properties required for data integration, + including the schedule. + **kwargs: Additional arguments passed to the base `cdk.Stack` class. + """ + super().__init__(scope, id, **kwargs) + + data_integration_lambda = DataIntegrationLambda(self, "data-integration-lambda") + + target = scheduler_targets.LambdaInvoke( + data_integration_lambda.lambda_function, + input=scheduler_alpha.ScheduleTargetInput.from_object({}), + ) + + # Create a group for the schedule (maybe we want to add more schedules + # to this group the future) + schedule_group = scheduler_alpha.Group( + self, + "group", + group_name="schedule-group", + ) + + scheduler_alpha.Schedule( + self, + "schedule", + schedule=props.schedule, + target=target, + group=schedule_group, + description=props.schedule_description, + ) diff --git a/openchallenges/service_props.py b/openchallenges/service_props.py index efa7316..25b16c9 100644 --- a/openchallenges/service_props.py +++ b/openchallenges/service_props.py @@ -1,6 +1,25 @@ +from dataclasses import dataclass +from typing import List + CONTAINER_LOCATION_PATH_ID = "path://" +@dataclass +class ContainerVolume: + """ + Holds onto configuration for a volume used in the container. + + Attributes: + path: The path on the container to mount the host volume at. + size: The size of the volume in GiB. + read_only: Container has read-only access to the volume, set to `false` for write access. + """ + + path: str + size: int = 15 + read_only: bool = False + + class ServiceProps: """ ECS service properties @@ -13,6 +32,7 @@ class ServiceProps: supports docker registry references (i.e. ghcr.io/sage-bionetworks/openchallenges-thumbor:latest) container_env_vars: a json dictionary of environment variables to pass into the container i.e. 
{"EnvA": "EnvValueA", "EnvB": "EnvValueB"} + container_volumes: List of `ContainerVolume` resources to mount into the container """ def __init__( @@ -22,6 +42,7 @@ def __init__( container_memory: int, container_location: str, container_env_vars: dict, + container_volumes: List[ContainerVolume] = None, ) -> None: self.container_name = container_name self.container_port = container_port @@ -32,3 +53,7 @@ def __init__( ) self.container_location = container_location self.container_env_vars = container_env_vars + if container_volumes is None: + self.container_volumes = [] + else: + self.container_volumes = container_volumes diff --git a/openchallenges/service_stack.py b/openchallenges/service_stack.py index 3911a7e..e1be848 100644 --- a/openchallenges/service_stack.py +++ b/openchallenges/service_stack.py @@ -60,6 +60,28 @@ def __init__( ) ) + # default ECS execution policy plus Guardduty access + execution_role = iam.Role( + self, + "ExecutionRole", + assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"), + managed_policies=[ + iam.ManagedPolicy.from_aws_managed_policy_name( + "service-role/AmazonECSTaskExecutionRolePolicy" + ), + ], + ) + execution_role.add_to_policy( + iam.PolicyStatement( + actions=[ + "logs:CreateLogStream", + "logs:PutLogEvents", + ], + resources=["*"], + effect=iam.Effect.ALLOW, + ) + ) + # ECS task with fargate self.task_definition = ecs.FargateTaskDefinition( self, @@ -67,6 +89,7 @@ def __init__( cpu=1024, memory_limit_mib=4096, task_role=task_role, + execution_role=execution_role, ) image = ecs.ContainerImage.from_registry(props.container_location) @@ -118,14 +141,14 @@ def __init__( ), ) - # mount volume for DB - if "mariadb" in construct_id: - self.volume = ecs.ServiceManagedVolume( + # mount volumes + for container_volume in props.container_volumes: + service_volume = ecs.ServiceManagedVolume( self, - "ServiceVolume", + "ContainerVolume", name=props.container_name, managed_ebs_volume=ecs.ServiceManagedEBSVolumeConfiguration( - size=size.gibibytes(30), + size=size.gibibytes(container_volume.size), volume_type=ec2.EbsDeviceVolumeType.GP3, ), ) @@ -133,13 +156,12 @@ def __init__( self.task_definition.add_volume( name=props.container_name, configured_at_launch=True ) - self.service.add_volume(self.volume) + self.service.add_volume(service_volume) - self.volume.mount_in( - # should be mounted at openchallenges-mariadb:/data/db + service_volume.mount_in( self.container, - container_path="/data/db", - read_only=False, + container_path=container_volume.path, + read_only=container_volume.read_only, ) diff --git a/requirements.txt b/requirements.txt index 92c10ed..b1c51a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ aws-cdk-lib==2.139.0 +aws-cdk.aws-scheduler-alpha==2.139.0a0 +aws-cdk.aws-scheduler-targets-alpha==2.139.0a0 constructs>=10.0.0,<11.0.0 boto3>=1.34.1 diff --git a/source.bat b/source.bat deleted file mode 100644 index 9e1a834..0000000 --- a/source.bat +++ /dev/null @@ -1,13 +0,0 @@ -@echo off - -rem The sole purpose of this script is to make the command -rem -rem source .venv/bin/activate -rem -rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. -rem On Windows, this command just runs this batch file (the argument is ignored). -rem -rem Now we don't need to document a Windows command for activating a virtualenv. - -echo Executing .venv\Scripts\activate.bat for you -.venv\Scripts\activate.bat