From 5570c0c70ed889d1e4eeb362c88aa98b8229f152 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 12:39:35 +0100 Subject: [PATCH 01/50] Create Sagemaker pipeline schedules if specified --- .../orchestrators/sagemaker_orchestrator.py | 146 ++++++++++++++---- 1 file changed, 112 insertions(+), 34 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index f832647a97..6564414df4 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -15,6 +15,7 @@ import os import re +from datetime import datetime from typing import ( TYPE_CHECKING, Any, @@ -245,12 +246,8 @@ def prepare_or_run_pipeline( Yields: A dictionary of metadata related to the pipeline run. """ - if deployment.schedule: - logger.warning( - "The Sagemaker Orchestrator currently does not support the " - "use of schedules. The `schedule` will be ignored " - "and the pipeline will be run immediately." - ) + # Get the session and client + session = self._get_sagemaker_session() # sagemaker requires pipelineName to use alphanum and hyphens only unsanitized_orchestrator_run_name = get_orchestrator_run_name( @@ -459,7 +456,7 @@ def prepare_or_run_pipeline( sagemaker_steps.append(sagemaker_step) - # construct the pipeline from the sagemaker_steps + # Create the pipeline pipeline = Pipeline( name=orchestrator_run_name, steps=sagemaker_steps, @@ -479,38 +476,119 @@ def prepare_or_run_pipeline( if settings.pipeline_tags else None, ) - execution = pipeline.start() - logger.warning( - "Steps can take 5-15 minutes to start running " - "when using the Sagemaker Orchestrator." 
- ) - # Yield metadata based on the generated execution object - yield from self.compute_metadata( - execution=execution, settings=settings - ) + # Handle scheduling if specified + if deployment.schedule: + if settings.synchronous: + logger.warning( + "The 'synchronous' setting is ignored for scheduled pipelines since " + "they run independently of the deployment process." + ) - # mainly for testing purposes, we wait for the pipeline to finish - if settings.synchronous: - logger.info( - "Executing synchronously. Waiting for pipeline to finish... \n" - "At this point you can `Ctrl-C` out without cancelling the " - "execution." + events_client = session.boto_session.client("events") + rule_name = f"zenml-{deployment.pipeline_configuration.name}" + + # Determine first execution time based on schedule type + if deployment.schedule.cron_expression: + schedule_expr = f"cron({deployment.schedule.cron_expression})" + next_execution = ( + None # Exact time calculation would require cron parsing + ) + elif deployment.schedule.interval_second: + minutes = ( + deployment.schedule.interval_second.total_seconds() / 60 + ) + schedule_expr = f"rate({int(minutes)} minutes)" + next_execution = ( + datetime.utcnow() + deployment.schedule.interval_second + ) + elif deployment.schedule.run_once_start_time: + schedule_expr = f"at({deployment.schedule.run_once_start_time.strftime('%Y-%m-%dT%H:%M:%S')})" + next_execution = deployment.schedule.run_once_start_time + + events_client.put_rule( + Name=rule_name, + ScheduleExpression=schedule_expr, + State="ENABLED" + ) + + # Add the SageMaker pipeline as target + events_client.put_targets( + Rule=rule_name, + Targets=[ + { + "Id": f"zenml-target-{deployment.pipeline_configuration.name}", + "Arn": f"arn:aws:sagemaker:{session.boto_region_name}:{session.boto_session.client('sts').get_caller_identity()['Account']}:pipeline/{orchestrator_run_name}", + "RoleArn": self.config.execution_role, + } + ], ) - try: - execution.wait( - delay=POLLING_DELAY, 
max_attempts=MAX_POLLING_ATTEMPTS + + logger.info( + f"Successfully scheduled pipeline with rule: {rule_name}\n" + f"Schedule type: {schedule_expr}\n" + + ( + f"First execution will occur at: {next_execution.strftime('%Y-%m-%d %H:%M:%S')} UTC" + if next_execution + else f"Using cron expression: {deployment.schedule.cron_expression}" ) - logger.info("Pipeline completed successfully.") - except WaiterError: - raise RuntimeError( - "Timed out while waiting for pipeline execution to " - "finish. For long-running pipelines we recommend " - "configuring your orchestrator for asynchronous execution. " - "The following command does this for you: \n" - f"`zenml orchestrator update {self.name} " - f"--synchronous=False`" + + ( + f" (and every {int(minutes)} minutes after)" + if deployment.schedule.interval_second + else "" ) + ) + + # Yield metadata about the schedule + yield { + "schedule_rule_name": rule_name, + "schedule_type": ( + "cron" + if deployment.schedule.cron_expression + else "rate" + if deployment.schedule.interval_second + else "one-time" + ), + "schedule_expression": schedule_expr, + "pipeline_name": orchestrator_run_name, + "next_execution_time": next_execution.isoformat() + if next_execution + else None, + } + else: + # Execute the pipeline immediately if no schedule is specified + execution = pipeline.start() + logger.warning( + "Steps can take 5-15 minutes to start running " + "when using the Sagemaker Orchestrator." + ) + + # Yield metadata based on the generated execution object + yield from self.compute_metadata( + execution=execution, settings=settings + ) + + # mainly for testing purposes, we wait for the pipeline to finish + if settings.synchronous: + logger.info( + "Executing synchronously. Waiting for pipeline to finish... \n" + "At this point you can `Ctrl-C` out without cancelling the " + "execution." 
+ ) + try: + execution.wait( + delay=POLLING_DELAY, max_attempts=MAX_POLLING_ATTEMPTS + ) + logger.info("Pipeline completed successfully.") + except WaiterError: + raise RuntimeError( + "Timed out while waiting for pipeline execution to " + "finish. For long-running pipelines we recommend " + "configuring your orchestrator for asynchronous execution. " + "The following command does this for you: \n" + f"`zenml orchestrator update {self.name} " + f"--synchronous=False`" + ) def get_pipeline_run_metadata( self, run_id: UUID From 6f7bf288a177fa1bf070f0abf65956e1086741a3 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 13:31:33 +0100 Subject: [PATCH 02/50] Add property to check if orchestrator is schedulable --- .../flavors/sagemaker_orchestrator_flavor.py | 9 +++++++ .../orchestrators/sagemaker_orchestrator.py | 26 ++++++++++--------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/zenml/integrations/aws/flavors/sagemaker_orchestrator_flavor.py b/src/zenml/integrations/aws/flavors/sagemaker_orchestrator_flavor.py index 2898f24cc6..ea17571d77 100644 --- a/src/zenml/integrations/aws/flavors/sagemaker_orchestrator_flavor.py +++ b/src/zenml/integrations/aws/flavors/sagemaker_orchestrator_flavor.py @@ -132,6 +132,15 @@ class SagemakerOrchestratorSettings(BaseSettings): ("processor_role", "execution_role"), ("processor_tags", "tags") ) + @property + def is_schedulable(self) -> bool: + """Whether the orchestrator is schedulable or not. + + Returns: + Whether the orchestrator is schedulable or not. + """ + return True + @model_validator(mode="before") def validate_model(cls, data: Dict[str, Any]) -> Dict[str, Any]: """Check if model is configured correctly. 
diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 6564414df4..2d75dfe8b4 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -490,22 +490,24 @@ def prepare_or_run_pipeline( # Determine first execution time based on schedule type if deployment.schedule.cron_expression: - schedule_expr = f"cron({deployment.schedule.cron_expression})" - next_execution = ( - None # Exact time calculation would require cron parsing - ) + # AWS EventBridge requires cron expressions in format: cron(0 12 * * ? *) + # Strip any "cron(" prefix if it exists + cron_exp = deployment.schedule.cron_expression.replace("cron(", "").replace(")", "") + schedule_expr = f"cron({cron_exp})" + next_execution = None elif deployment.schedule.interval_second: - minutes = ( - deployment.schedule.interval_second.total_seconds() / 60 - ) - schedule_expr = f"rate({int(minutes)} minutes)" - next_execution = ( - datetime.utcnow() + deployment.schedule.interval_second - ) + minutes = max(1, int(deployment.schedule.interval_second.total_seconds() / 60)) + schedule_expr = f"rate({minutes} minutes)" + next_execution = datetime.utcnow() + deployment.schedule.interval_second elif deployment.schedule.run_once_start_time: - schedule_expr = f"at({deployment.schedule.run_once_start_time.strftime('%Y-%m-%dT%H:%M:%S')})" + # Format for specific date/time: cron(Minutes Hours Day-of-month Month ? Year) + # Example: cron(0 12 1 1 ? 2024) + dt = deployment.schedule.run_once_start_time + schedule_expr = f"cron({dt.minute} {dt.hour} {dt.day} {dt.month} ? 
{dt.year})" next_execution = deployment.schedule.run_once_start_time + logger.info(f"Creating EventBridge rule with schedule expression: {schedule_expr}") + events_client.put_rule( Name=rule_name, ScheduleExpression=schedule_expr, From 0212388d2db16523350abdd2d868bbfac898b15a Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 13:54:05 +0100 Subject: [PATCH 03/50] Add EventBridge rule for SageMaker pipeline execution --- .../orchestrators/sagemaker_orchestrator.py | 94 +++++++++++++++++-- 1 file changed, 88 insertions(+), 6 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 2d75dfe8b4..b0f2cef247 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -13,6 +13,7 @@ # permissions and limitations under the License. """Implementation of the SageMaker orchestrator.""" +import json import os import re from datetime import datetime @@ -492,13 +493,92 @@ def prepare_or_run_pipeline( if deployment.schedule.cron_expression: # AWS EventBridge requires cron expressions in format: cron(0 12 * * ? *) # Strip any "cron(" prefix if it exists - cron_exp = deployment.schedule.cron_expression.replace("cron(", "").replace(")", "") + cron_exp = deployment.schedule.cron_expression.replace( + "cron(", "" + ).replace(")", "") schedule_expr = f"cron({cron_exp})" next_execution = None elif deployment.schedule.interval_second: - minutes = max(1, int(deployment.schedule.interval_second.total_seconds() / 60)) + minutes = max( + 1, + int( + deployment.schedule.interval_second.total_seconds() + / 60 + ), + ) + schedule_expr = f"rate({minutes} minutes)" + next_execution = ( + datetime.utcnow() + deployment.schedule.interval_second + ) + elif deployment.schedule.run_once_start_time: + # Format for specific date/time: cron(Minutes Hours Day-of-month Month ? 
Year) + # Example: cron(0 12 1 1 ? 2024) + dt = deployment.schedule.run_once_start_time + schedule_expr = f"cron({dt.minute} {dt.hour} {dt.day} {dt.month} ? {dt.year})" + next_execution = deployment.schedule.run_once_start_time + + logger.info( + f"Creating EventBridge rule with schedule expression: {schedule_expr}" + ) + + # Create IAM policy for EventBridge to trigger SageMaker pipeline + iam_client = session.boto_session.client("iam") + + # Create the policy document + policy_document = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": ["sagemaker:StartPipelineExecution"], + "Resource": f"arn:aws:sagemaker:{session.boto_region_name}:{session.boto_session.client('sts').get_caller_identity()['Account']}:pipeline/{orchestrator_run_name}", + } + ], + } + + # Create or update the role policy + try: + role_name = self.config.execution_role.split("/")[ + -1 + ] # Extract role name from ARN + policy_name = f"zenml-eventbridge-{orchestrator_run_name}" + + iam_client.put_role_policy( + RoleName=role_name, + PolicyName=policy_name, + PolicyDocument=json.dumps(policy_document), + ) + + logger.info(f"Created/Updated IAM policy: {policy_name}") + except Exception as e: + logger.error(f"Failed to create/update IAM policy: {e}") + raise + + # Create the EventBridge rule + events_client = session.boto_session.client("events") + rule_name = f"zenml-{deployment.pipeline_configuration.name}" + + # Determine first execution time based on schedule type + if deployment.schedule.cron_expression: + # AWS EventBridge requires cron expressions in format: cron(0 12 * * ? 
*) + # Strip any "cron(" prefix if it exists + cron_exp = deployment.schedule.cron_expression.replace( + "cron(", "" + ).replace(")", "") + schedule_expr = f"cron({cron_exp})" + next_execution = None + elif deployment.schedule.interval_second: + minutes = max( + 1, + int( + deployment.schedule.interval_second.total_seconds() + / 60 + ), + ) schedule_expr = f"rate({minutes} minutes)" - next_execution = datetime.utcnow() + deployment.schedule.interval_second + next_execution = ( + datetime.utcnow() + deployment.schedule.interval_second + ) elif deployment.schedule.run_once_start_time: # Format for specific date/time: cron(Minutes Hours Day-of-month Month ? Year) # Example: cron(0 12 1 1 ? 2024) @@ -506,15 +586,17 @@ def prepare_or_run_pipeline( schedule_expr = f"cron({dt.minute} {dt.hour} {dt.day} {dt.month} ? {dt.year})" next_execution = deployment.schedule.run_once_start_time - logger.info(f"Creating EventBridge rule with schedule expression: {schedule_expr}") + logger.info( + f"Creating EventBridge rule with schedule expression: {schedule_expr}" + ) events_client.put_rule( Name=rule_name, ScheduleExpression=schedule_expr, - State="ENABLED" + State="ENABLED", ) - # Add the SageMaker pipeline as target + # Add the SageMaker pipeline as target with the role events_client.put_targets( Rule=rule_name, Targets=[ From 21253127c5abc668594e04ea6d109a9a32228767 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 15:05:35 +0100 Subject: [PATCH 04/50] Update IAM policy and trust relationship for EventBridge --- .../orchestrators/sagemaker_orchestrator.py | 40 ++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index b0f2cef247..d0bd3a1fb0 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -521,10 
+521,13 @@ def prepare_or_run_pipeline( f"Creating EventBridge rule with schedule expression: {schedule_expr}" ) - # Create IAM policy for EventBridge to trigger SageMaker pipeline + # Create IAM policy and trust relationship for EventBridge iam_client = session.boto_session.client("iam") + role_name = self.config.execution_role.split("/")[ + -1 + ] # Extract role name from ARN - # Create the policy document + # Create the policy document (existing) policy_document = { "Version": "2012-10-17", "Statement": [ @@ -536,22 +539,41 @@ def prepare_or_run_pipeline( ], } - # Create or update the role policy + # Create the trust relationship document + trust_relationship = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": "events.amazonaws.com"}, + "Action": "sts:AssumeRole", + } + ], + } + try: - role_name = self.config.execution_role.split("/")[ - -1 - ] # Extract role name from ARN + # Update the role policy (existing) policy_name = f"zenml-eventbridge-{orchestrator_run_name}" - iam_client.put_role_policy( RoleName=role_name, PolicyName=policy_name, PolicyDocument=json.dumps(policy_document), ) - logger.info(f"Created/Updated IAM policy: {policy_name}") + + # Update the trust relationship + iam_client.update_assume_role_policy( + RoleName=role_name, + PolicyDocument=json.dumps(trust_relationship), + ) + logger.info( + f"Updated trust relationship for role: {role_name}" + ) + except Exception as e: - logger.error(f"Failed to create/update IAM policy: {e}") + logger.error( + f"Failed to update IAM policy or trust relationship: {e}" + ) raise # Create the EventBridge rule From 966f71223582373f648ca75c77c921c11bd41e9f Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 15:10:49 +0100 Subject: [PATCH 05/50] Refactor schedule metadata generation for Sagemaker orchestrator --- .../orchestrators/sagemaker_orchestrator.py | 62 ++++++++++++++----- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git 
a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index d0bd3a1fb0..7c864770a8 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -646,21 +646,21 @@ def prepare_or_run_pipeline( ) # Yield metadata about the schedule - yield { - "schedule_rule_name": rule_name, - "schedule_type": ( - "cron" - if deployment.schedule.cron_expression - else "rate" - if deployment.schedule.interval_second - else "one-time" - ), - "schedule_expression": schedule_expr, - "pipeline_name": orchestrator_run_name, - "next_execution_time": next_execution.isoformat() - if next_execution - else None, - } + schedule_type = ( + "cron" + if deployment.schedule.cron_expression + else "rate" + if deployment.schedule.interval_second + else "one-time" + ) + + yield self.compute_schedule_metadata( + rule_name=rule_name, + schedule_expr=schedule_expr, + pipeline_name=orchestrator_run_name, + next_execution=next_execution, + schedule_type=schedule_type, + ) else: # Execute the pipeline immediately if no schedule is specified execution = pipeline.start() @@ -896,3 +896,35 @@ def _compute_orchestrator_run_id( f"There was an issue while extracting the pipeline run ID: {e}" ) return None + + def compute_schedule_metadata( + self, + rule_name: str, + schedule_expr: str, + pipeline_name: str, + next_execution: Optional[datetime], + schedule_type: str, + ) -> Dict[str, MetadataType]: + """Generate metadata for scheduled pipeline executions. + + Args: + rule_name: The name of the EventBridge rule + schedule_expr: The schedule expression (cron or rate) + pipeline_name: Name of the SageMaker pipeline + next_execution: Next scheduled execution time + schedule_type: Type of schedule (cron/rate/one-time) + + Returns: + A dictionary of metadata related to the schedule. 
+ """ + metadata: Dict[str, MetadataType] = { + "schedule_rule_name": rule_name, + "schedule_type": schedule_type, + "schedule_expression": schedule_expr, + "pipeline_name": pipeline_name, + } + + if next_execution: + metadata["next_execution_time"] = next_execution.isoformat() + + return metadata From dd7110c9fa3d3e7866a956facdb51a41b79720c7 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 15:17:54 +0100 Subject: [PATCH 06/50] Add scheduling support for SageMaker orchestrator --- .../component-guide/orchestrators/custom.md | 2 +- .../orchestrators/sagemaker.md | 100 +++++++++++++++++- .../build-pipelines/schedule-a-pipeline.md | 2 +- 3 files changed, 97 insertions(+), 7 deletions(-) diff --git a/docs/book/component-guide/orchestrators/custom.md b/docs/book/component-guide/orchestrators/custom.md index 539aecdd6b..0aa68ac8b0 100644 --- a/docs/book/component-guide/orchestrators/custom.md +++ b/docs/book/component-guide/orchestrators/custom.md @@ -141,7 +141,7 @@ To see a full end-to-end worked example of a custom orchestrator, [see here](htt There are some additional optional features that your orchestrator can implement: -* **Running pipelines on a schedule**: if your orchestrator supports running pipelines on a schedule, make sure to handle `deployment.schedule` if it exists. If your orchestrator does not support schedules, you should either log a warning and or even raise an exception in case the user tries to schedule a pipeline. +* **Running pipelines on a schedule**: if your orchestrator supports running pipelines on a schedule, make sure to handle `deployment.schedule` if it exists. If your orchestrator ules, you should either log a warning and or even raise an exception in case the user tries to schedule a pipeline. * **Specifying hardware resources**: If your orchestrator supports setting resources like CPUs, GPUs or memory for the pipeline or specific steps, make sure to handle the values defined in `step.config.resource_settings`. 
See the code sample below for additional helper methods to check whether any resources are required from your orchestrator. ### Code sample diff --git a/docs/book/component-guide/orchestrators/sagemaker.md b/docs/book/component-guide/orchestrators/sagemaker.md index 6464333934..74dfcdc335 100644 --- a/docs/book/component-guide/orchestrators/sagemaker.md +++ b/docs/book/component-guide/orchestrators/sagemaker.md @@ -153,10 +153,6 @@ Alternatively, for a more detailed view of log messages during SageMaker pipelin ![SageMaker CloudWatch Logs](../../.gitbook/assets/sagemaker-cloudwatch-logs.png) -### Run pipelines on a schedule - -The ZenML Sagemaker orchestrator doesn't currently support running pipelines on a schedule. We maintain a public roadmap for ZenML, which you can find [here](https://zenml.io/roadmap). We welcome community contributions (see more [here](https://github.com/zenml-io/zenml/blob/main/CONTRIBUTING.md)) so if you want to enable scheduling for Sagemaker, please [do let us know](https://zenml.io/slack)! - ### Configuration at pipeline or step level When running your ZenML pipeline with the Sagemaker orchestrator, the configuration set when configuring the orchestrator as a ZenML component will be used by default. However, it is possible to provide additional configuration at the pipeline or step level. This allows you to run whole pipelines or individual steps with alternative configurations. For example, this allows you to run the training process with a heavier, GPU-enabled instance type, while running other steps with lighter instances. @@ -339,4 +335,98 @@ This approach allows for more granular tagging, giving you flexibility in how yo Note that if you wish to use this orchestrator to run steps on a GPU, you will need to follow [the instructions on this page](../../how-to/pipeline-development/training-with-gpus/README.md) to ensure that it works. 
It requires adding some extra settings customization and is essential to enable CUDA for the GPU to give its full acceleration. -
ZenML Scarf
+### Scheduling Pipelines + +The SageMaker orchestrator supports running pipelines on a schedule using AWS EventBridge. You can configure schedules in three ways: + +* Using a cron expression +* Using a fixed interval +* Running once at a specific time + +```python +from zenml import pipeline +from datetime import datetime, timedelta + +# Using a cron expression (runs daily at 2 AM UTC) +@pipeline(schedule=Schedule(cron_expression="0 2 * * *")) +def my_scheduled_pipeline(): + # Your pipeline steps here + pass + +# Using an interval (runs every 2 hours) +@pipeline(schedule=Schedule(interval_second=timedelta(hours=2))) +def my_interval_pipeline(): + # Your pipeline steps here + pass + +# Running once at a specific time +@pipeline(schedule=Schedule(run_once_start_time=datetime(2024, 12, 31, 23, 59))) +def my_one_time_pipeline(): + # Your pipeline steps here + pass +``` + +When you deploy a scheduled pipeline, ZenML will: +1. Create an EventBridge rule with the specified schedule +2. Configure the necessary IAM permissions +3. Set up the SageMaker pipeline as the target + +#### Required IAM Permissions + +When using scheduled pipelines, you need to ensure your IAM role has the correct permissions and trust relationships: + +1. Trust Relationships +Your execution role needs to trust both SageMaker and EventBridge services. Add this trust relationship to your role: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": [ + "sagemaker.amazonaws.com", + "events.amazonaws.com" + ] + }, + "Action": "sts:AssumeRole" + } + ] +} +``` + +2. 
Required IAM Policies +In addition to the basic SageMaker permissions, you'll need: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "events:PutRule", + "events:PutTargets", + "events:DeleteRule", + "events:RemoveTargets", + "events:DescribeRule", + "events:ListTargetsByRule" + ], + "Resource": "arn:aws:events:*:*:rule/zenml-*" + }, + { + "Effect": "Allow", + "Action": [ + "iam:GetRole", + "iam:GetRolePolicy", + "iam:PutRolePolicy", + "iam:UpdateAssumeRolePolicy" + ], + "Resource": "arn:aws:iam::*:role/*" + } + ] +} +``` + +
ZenML Scarf
\ No newline at end of file diff --git a/docs/book/how-to/pipeline-development/build-pipelines/schedule-a-pipeline.md b/docs/book/how-to/pipeline-development/build-pipelines/schedule-a-pipeline.md index be725e386f..f922339393 100644 --- a/docs/book/how-to/pipeline-development/build-pipelines/schedule-a-pipeline.md +++ b/docs/book/how-to/pipeline-development/build-pipelines/schedule-a-pipeline.md @@ -18,7 +18,7 @@ Schedules don't work for all orchestrators. Here is a list of all supported orch | [KubernetesOrchestrator](../../../component-guide/orchestrators/kubernetes.md) | ✅ | | [LocalOrchestrator](../../../component-guide/orchestrators/local.md) | ⛔️ | | [LocalDockerOrchestrator](../../../component-guide/orchestrators/local-docker.md) | ⛔️ | -| [SagemakerOrchestrator](../../../component-guide/orchestrators/sagemaker.md) | ⛔️ | +| [SagemakerOrchestrator](../../../component-guide/orchestrators/sagemaker.md) | ✅ | | [SkypilotAWSOrchestrator](../../../component-guide/orchestrators/skypilot-vm.md) | ⛔️ | | [SkypilotAzureOrchestrator](../../../component-guide/orchestrators/skypilot-vm.md) | ⛔️ | | [SkypilotGCPOrchestrator](../../../component-guide/orchestrators/skypilot-vm.md) | ⛔️ | From fcf934e67b5d3536b573965e177da66b0f63ff50 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 15:21:54 +0100 Subject: [PATCH 07/50] Remove trust relationship logic in Sagemaker orchestrator --- .../orchestrators/sagemaker_orchestrator.py | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 7c864770a8..0efb112fe7 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -539,18 +539,6 @@ def prepare_or_run_pipeline( ], } - # Create the trust relationship document - trust_relationship = { - "Version": "2012-10-17", - 
"Statement": [ - { - "Effect": "Allow", - "Principal": {"Service": "events.amazonaws.com"}, - "Action": "sts:AssumeRole", - } - ], - } - try: # Update the role policy (existing) policy_name = f"zenml-eventbridge-{orchestrator_run_name}" @@ -561,15 +549,6 @@ def prepare_or_run_pipeline( ) logger.info(f"Created/Updated IAM policy: {policy_name}") - # Update the trust relationship - iam_client.update_assume_role_policy( - RoleName=role_name, - PolicyDocument=json.dumps(trust_relationship), - ) - logger.info( - f"Updated trust relationship for role: {role_name}" - ) - except Exception as e: logger.error( f"Failed to update IAM policy or trust relationship: {e}" From 358b3e811c22c1ecec51cd7550afd278d63096f6 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 15:25:34 +0100 Subject: [PATCH 08/50] Handle unsupported schedule in custom orchestrator --- docs/book/component-guide/orchestrators/custom.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/book/component-guide/orchestrators/custom.md b/docs/book/component-guide/orchestrators/custom.md index 0aa68ac8b0..539aecdd6b 100644 --- a/docs/book/component-guide/orchestrators/custom.md +++ b/docs/book/component-guide/orchestrators/custom.md @@ -141,7 +141,7 @@ To see a full end-to-end worked example of a custom orchestrator, [see here](htt There are some additional optional features that your orchestrator can implement: -* **Running pipelines on a schedule**: if your orchestrator supports running pipelines on a schedule, make sure to handle `deployment.schedule` if it exists. If your orchestrator ules, you should either log a warning and or even raise an exception in case the user tries to schedule a pipeline. +* **Running pipelines on a schedule**: if your orchestrator supports running pipelines on a schedule, make sure to handle `deployment.schedule` if it exists. 
If your orchestrator does not support schedules, you should either log a warning and or even raise an exception in case the user tries to schedule a pipeline. * **Specifying hardware resources**: If your orchestrator supports setting resources like CPUs, GPUs or memory for the pipeline or specific steps, make sure to handle the values defined in `step.config.resource_settings`. See the code sample below for additional helper methods to check whether any resources are required from your orchestrator. ### Code sample From 7ce08b63d2418e8311db475b857f7a5664d64751 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 16:40:16 +0100 Subject: [PATCH 09/50] Refactor yield statement to use 'yield from' syntax --- .../integrations/aws/orchestrators/sagemaker_orchestrator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 0efb112fe7..72bfa14c9e 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -633,7 +633,7 @@ def prepare_or_run_pipeline( else "one-time" ) - yield self.compute_schedule_metadata( + yield from self.compute_schedule_metadata( rule_name=rule_name, schedule_expr=schedule_expr, pipeline_name=orchestrator_run_name, From 72bdae1aababf3bb64eb6639128263b767aea33b Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 16:47:53 +0100 Subject: [PATCH 10/50] Ensure IAM permissions for scheduled SageMaker pipelines --- .../orchestrators/sagemaker.md | 42 +++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/docs/book/component-guide/orchestrators/sagemaker.md b/docs/book/component-guide/orchestrators/sagemaker.md index 74dfcdc335..ecac6529c8 100644 --- a/docs/book/component-guide/orchestrators/sagemaker.md +++ 
b/docs/book/component-guide/orchestrators/sagemaker.md @@ -373,10 +373,10 @@ When you deploy a scheduled pipeline, ZenML will: #### Required IAM Permissions -When using scheduled pipelines, you need to ensure your IAM role has the correct permissions and trust relationships: +When using scheduled pipelines, you need to ensure your IAM role has the correct permissions and trust relationships. Here's a detailed breakdown of why each permission is needed: -1. Trust Relationships -Your execution role needs to trust both SageMaker and EventBridge services. Add this trust relationship to your role: +1. **Trust Relationships** +Your execution role needs to trust both SageMaker and EventBridge services to allow them to assume the role: ```json { @@ -386,8 +386,8 @@ Your execution role needs to trust both SageMaker and EventBridge services. Add "Effect": "Allow", "Principal": { "Service": [ - "sagemaker.amazonaws.com", - "events.amazonaws.com" + "sagemaker.amazonaws.com", // Required for SageMaker execution + "events.amazonaws.com" // Required for EventBridge to trigger pipelines ] }, "Action": "sts:AssumeRole" @@ -396,8 +396,8 @@ Your execution role needs to trust both SageMaker and EventBridge services. Add } ``` -2. Required IAM Policies -In addition to the basic SageMaker permissions, you'll need: +2. 
**Required IAM Policies** +In addition to the basic SageMaker permissions, the AWS credentials used by the service connector (or provided directly to the orchestrator) need the following permissions to create and manage scheduled pipelines: ```json { @@ -406,22 +406,22 @@ In addition to the basic SageMaker permissions, you'll need: { "Effect": "Allow", "Action": [ - "events:PutRule", - "events:PutTargets", - "events:DeleteRule", - "events:RemoveTargets", - "events:DescribeRule", - "events:ListTargetsByRule" + "events:PutRule", // Required to create schedule rules + "events:PutTargets", // Required to set pipeline as target + "events:DeleteRule", // Required for cleanup + "events:RemoveTargets", // Required for cleanup + "events:DescribeRule", // Required to verify rule creation + "events:ListTargetsByRule" // Required to verify target setup ], "Resource": "arn:aws:events:*:*:rule/zenml-*" }, { "Effect": "Allow", "Action": [ - "iam:GetRole", - "iam:GetRolePolicy", - "iam:PutRolePolicy", - "iam:UpdateAssumeRolePolicy" + "iam:GetRole", // Required to verify role exists + "iam:GetRolePolicy", // Required to check existing policies + "iam:PutRolePolicy", // Required to add new policies + "iam:UpdateAssumeRolePolicy" // Required to update trust relationships ], "Resource": "arn:aws:iam::*:role/*" } @@ -429,4 +429,12 @@ In addition to the basic SageMaker permissions, you'll need: } ``` +These permissions enable: +* Creation and management of EventBridge rules for scheduling +* Setting up trust relationships between services +* Managing IAM policies required for the scheduled execution +* Cleanup of resources when schedules are removed + +Without these permissions, the scheduling functionality will fail with access denied errors. +
ZenML Scarf
\ No newline at end of file From abf46101d4de831931002e2cbbbf87237efb7cc9 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 16:48:00 +0100 Subject: [PATCH 11/50] Update authentication instructions for SageMaker orchestrator --- docs/book/component-guide/orchestrators/sagemaker.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/book/component-guide/orchestrators/sagemaker.md b/docs/book/component-guide/orchestrators/sagemaker.md index ecac6529c8..2da138af2d 100644 --- a/docs/book/component-guide/orchestrators/sagemaker.md +++ b/docs/book/component-guide/orchestrators/sagemaker.md @@ -59,7 +59,7 @@ There are three ways you can authenticate your orchestrator and link it to the I {% tabs %} {% tab title="Authentication via Service Connector" %} -The recommended way to authenticate your SageMaker orchestrator is by registering an [AWS Service Connector](../../how-to/infrastructure-deployment/auth-management/aws-service-connector.md) and connecting it to your SageMaker orchestrator: +The recommended way to authenticate your SageMaker orchestrator is by registering an [AWS Service Connector](../../how-to/infrastructure-deployment/auth-management/aws-service-connector.md) and connecting it to your SageMaker orchestrator. If you plan to use scheduled pipelines, ensure the credentials used by the service connector have the necessary EventBridge and IAM permissions listed in the [Required IAM Permissions](#required-iam-permissions) section: ```shell zenml service-connector register --type aws -i @@ -72,7 +72,7 @@ zenml stack register -o ... --set {% endtab %} {% tab title="Explicit Authentication" %} -Instead of creating a service connector, you can also configure your AWS authentication credentials directly in the orchestrator: +Instead of creating a service connector, you can also configure your AWS authentication credentials directly in the orchestrator. 
If you plan to use scheduled pipelines, ensure these credentials have the necessary EventBridge and IAM permissions listed in the [Required IAM Permissions](#required-iam-permissions) section: ```shell zenml orchestrator register \ @@ -88,7 +88,7 @@ See the [`SagemakerOrchestratorConfig` SDK Docs](https://sdkdocs.zenml.io/latest {% endtab %} {% tab title="Implicit Authentication" %} -If you neither connect your orchestrator to a service connector nor configure credentials explicitly, ZenML will try to implicitly authenticate to AWS via the `default` profile in your local [AWS configuration file](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html). +If you neither connect your orchestrator to a service connector nor configure credentials explicitly, ZenML will try to implicitly authenticate to AWS via the `default` profile in your local [AWS configuration file](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html). If you plan to use scheduled pipelines, ensure this profile has the necessary EventBridge and IAM permissions listed in the [Required IAM Permissions](#required-iam-permissions) section: ```shell zenml orchestrator register \ From 67705c261eeae0789a516d39066a7605ea46ad63 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 21:35:39 +0100 Subject: [PATCH 12/50] Refactor Sagemaker orchestrator metadata handling --- .../orchestrators/sagemaker_orchestrator.py | 142 ++++++++++++------ 1 file changed, 99 insertions(+), 43 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 72bfa14c9e..1abae0806d 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -521,7 +521,7 @@ def prepare_or_run_pipeline( f"Creating EventBridge rule with schedule expression: {schedule_expr}" ) - # Create IAM policy and 
trust relationship for EventBridge + # Create IAM policy for EventBridge iam_client = session.boto_session.client("iam") role_name = self.config.execution_role.split("/")[ -1 @@ -633,12 +633,17 @@ def prepare_or_run_pipeline( else "one-time" ) - yield from self.compute_schedule_metadata( - rule_name=rule_name, - schedule_expr=schedule_expr, - pipeline_name=orchestrator_run_name, - next_execution=next_execution, - schedule_type=schedule_type, + schedule_metadata = { + "rule_name": rule_name, + "schedule_type": schedule_type, + "schedule_expr": schedule_expr, + "pipeline_name": orchestrator_run_name, + "next_execution": next_execution, + } + + yield from self.compute_metadata( + execution=schedule_metadata, + settings=settings, ) else: # Execute the pipeline immediately if no schedule is specified @@ -757,7 +762,7 @@ def compute_metadata( """Generate run metadata based on the generated Sagemaker Execution. Args: - execution: The corresponding _PipelineExecution object. + execution: The corresponding _PipelineExecution object or schedule metadata dict. settings: The Sagemaker orchestrator settings. 
Yields: @@ -766,19 +771,40 @@ def compute_metadata( # Metadata metadata: Dict[str, MetadataType] = {} - # Orchestrator Run ID - if run_id := self._compute_orchestrator_run_id(execution): - metadata[METADATA_ORCHESTRATOR_RUN_ID] = run_id + # Handle schedule metadata if execution is a dict + if isinstance(execution, dict): + metadata.update( + { + "schedule_rule_name": execution["rule_name"], + "schedule_type": execution["schedule_type"], + "schedule_expression": execution["schedule_expr"], + "pipeline_name": execution["pipeline_name"], + } + ) + + if next_execution := execution.get("next_execution"): + metadata["next_execution_time"] = next_execution.isoformat() + + # Add orchestrator metadata using the same pattern as execution metadata + if orchestrator_url := self._compute_schedule_url(execution): + metadata[METADATA_ORCHESTRATOR_URL] = Uri(orchestrator_url) - # URL to the Sagemaker's pipeline view - if orchestrator_url := self._compute_orchestrator_url(execution): - metadata[METADATA_ORCHESTRATOR_URL] = Uri(orchestrator_url) + if logs_url := self._compute_schedule_logs_url( + execution, settings + ): + metadata[METADATA_ORCHESTRATOR_LOGS_URL] = Uri(logs_url) + else: + # Handle execution metadata + if run_id := self._compute_orchestrator_run_id(execution): + metadata[METADATA_ORCHESTRATOR_RUN_ID] = run_id + + if orchestrator_url := self._compute_orchestrator_url(execution): + metadata[METADATA_ORCHESTRATOR_URL] = Uri(orchestrator_url) - # URL to the corresponding CloudWatch page - if logs_url := self._compute_orchestrator_logs_url( - execution, settings - ): - metadata[METADATA_ORCHESTRATOR_LOGS_URL] = Uri(logs_url) + if logs_url := self._compute_orchestrator_logs_url( + execution, settings + ): + metadata[METADATA_ORCHESTRATOR_LOGS_URL] = Uri(logs_url) yield metadata @@ -876,34 +902,64 @@ def _compute_orchestrator_run_id( ) return None - def compute_schedule_metadata( - self, - rule_name: str, - schedule_expr: str, - pipeline_name: str, - next_execution: 
Optional[datetime], - schedule_type: str, - ) -> Dict[str, MetadataType]: - """Generate metadata for scheduled pipeline executions. + @staticmethod + def _compute_schedule_url(schedule_info: Dict[str, Any]) -> Optional[str]: + """Generate the SageMaker Console URL for a scheduled pipeline. Args: - rule_name: The name of the EventBridge rule - schedule_expr: The schedule expression (cron or rate) - pipeline_name: Name of the SageMaker pipeline - next_execution: Next scheduled execution time - schedule_type: Type of schedule (cron/rate/one-time) + schedule_info: Dictionary containing schedule information. Returns: - A dictionary of metadata related to the schedule. + The URL to the pipeline in the SageMaker console. """ - metadata: Dict[str, MetadataType] = { - "schedule_rule_name": rule_name, - "schedule_type": schedule_type, - "schedule_expression": schedule_expr, - "pipeline_name": pipeline_name, - } + try: + # Get the Sagemaker session + session = boto3.Session(region_name=schedule_info["region"]) + sagemaker_client = session.client("sagemaker") + + # List the Studio domains and get the Studio Domain ID + domains_response = sagemaker_client.list_domains() + studio_domain_id = domains_response["Domains"][0]["DomainId"] + + return ( + f"https://studio-{studio_domain_id}.studio.{schedule_info['region']}." + f"sagemaker.aws/pipelines/view/{schedule_info['pipeline_name']}" + ) + except Exception as e: + logger.warning( + f"There was an issue while extracting the pipeline url: {e}" + ) + return None + + @staticmethod + def _compute_schedule_logs_url( + schedule_info: Dict[str, Any], + settings: SagemakerOrchestratorSettings, + ) -> Optional[str]: + """Generate the CloudWatch URL for a scheduled pipeline. - if next_execution: - metadata["next_execution_time"] = next_execution.isoformat() + Args: + schedule_info: Dictionary containing schedule information. + settings: The Sagemaker orchestrator settings. 
- return metadata + Returns: + The URL to query the pipeline logs in CloudWatch. + """ + try: + use_training_jobs = True + if settings.use_training_step is not None: + use_training_jobs = settings.use_training_step + + job_type = "Training" if use_training_jobs else "Processing" + + return ( + f"https://{schedule_info['region']}.console.aws.amazon.com/" + f"cloudwatch/home?region={schedule_info['region']}#logsV2:" + f"log-groups/log-group/$252Faws$252Fsagemaker$252F{job_type}Jobs" + f"$3FlogStreamNameFilter$3Dpipelines-" + ) + except Exception as e: + logger.warning( + f"There was an issue while extracting the logs url: {e}" + ) + return None From d190f31a3056173b5716199d46f3ea457035f8a0 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 21:43:33 +0100 Subject: [PATCH 13/50] Add unit tests for SageMaker orchestrator metadata --- .../test_sagemaker_orchestrator.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py b/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py index 788dcb0506..8a92768cba 100644 --- a/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py +++ b/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py @@ -13,8 +13,22 @@ # permissions and limitations under the License. 
+from datetime import datetime, timedelta +from unittest.mock import MagicMock, patch + +from zenml.constants import ( + METADATA_ORCHESTRATOR_LOGS_URL, + METADATA_ORCHESTRATOR_RUN_ID, + METADATA_ORCHESTRATOR_URL, +) from zenml.enums import StackComponentType from zenml.integrations.aws.flavors import SagemakerOrchestratorFlavor +from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import ( + SagemakerOrchestratorSettings, +) +from zenml.integrations.aws.orchestrators.sagemaker_orchestrator import ( + SagemakerOrchestrator, +) def test_sagemaker_orchestrator_flavor_attributes(): @@ -23,3 +37,110 @@ def test_sagemaker_orchestrator_flavor_attributes(): flavor = SagemakerOrchestratorFlavor() assert flavor.type == StackComponentType.ORCHESTRATOR assert flavor.name == "sagemaker" + + +def test_compute_schedule_metadata(): + """Tests that schedule metadata is computed correctly.""" + # Setup + orchestrator = SagemakerOrchestrator( + name="test_orchestrator", + id="test-id", + config={}, + flavor="sagemaker", + type="orchestrator", + user="test-user", + workspace="test-workspace", + created="2023-01-01", + updated="2023-01-01", + ) + settings = SagemakerOrchestratorSettings() + + # Mock schedule info + next_execution = datetime.utcnow() + timedelta(hours=1) + schedule_info = { + "rule_name": "test-rule", + "schedule_type": "rate", + "schedule_expr": "rate(1 hour)", + "pipeline_name": "test-pipeline", + "next_execution": next_execution, + "region": "us-west-2", + "account_id": "123456789012", + } + + # Mock boto3 session and SageMaker client + mock_sagemaker_client = MagicMock() + mock_sagemaker_client.list_domains.return_value = { + "Domains": [{"DomainId": "d-test123"}] + } + + with patch("boto3.Session") as mock_session: + mock_session.return_value.client.return_value = mock_sagemaker_client + + # Get metadata + metadata = next( + orchestrator.compute_metadata( + execution=schedule_info, + settings=settings, + ) + ) + + # Verify schedule-specific metadata + 
assert metadata["schedule_rule_name"] == "test-rule" + assert metadata["schedule_type"] == "rate" + assert metadata["schedule_expression"] == "rate(1 hour)" + assert metadata["pipeline_name"] == "test-pipeline" + assert metadata["next_execution_time"] == next_execution.isoformat() + + # Verify orchestrator metadata + assert metadata[METADATA_ORCHESTRATOR_URL] == ( + "https://studio-d-test123.studio.us-west-2.sagemaker.aws/pipelines/view/test-pipeline" + ) + assert metadata[METADATA_ORCHESTRATOR_LOGS_URL].startswith( + "https://us-west-2.console.aws.amazon.com/cloudwatch/home" + ) + + +def test_compute_schedule_metadata_error_handling(): + """Tests error handling in schedule metadata computation.""" + orchestrator = SagemakerOrchestrator( + name="test_orchestrator", + id="test-id", + config={}, + flavor="sagemaker", + type="orchestrator", + user="test-user", + workspace="test-workspace", + created="2023-01-01", + updated="2023-01-01", + ) + settings = SagemakerOrchestratorSettings() + + # Invalid schedule info missing required fields + schedule_info = { + "rule_name": "test-rule", + "schedule_type": "rate", # Add minimum required fields + "schedule_expr": "rate(1 hour)", + "pipeline_name": "test-pipeline", + } + + with patch("boto3.Session") as mock_session: + mock_session.side_effect = Exception("Failed to create session") + + # Get metadata - should not raise exception + metadata = next( + orchestrator.compute_metadata( + execution=schedule_info, + settings=settings, + ) + ) + + # Basic metadata should still be present + assert metadata["schedule_rule_name"] == "test-rule" + assert metadata["schedule_type"] == "rate" + assert metadata["schedule_expression"] == "rate(1 hour)" + assert metadata["pipeline_name"] == "test-pipeline" + + # URLs should be None due to error + assert METADATA_ORCHESTRATOR_RUN_ID not in metadata + assert METADATA_ORCHESTRATOR_URL not in metadata + assert METADATA_ORCHESTRATOR_LOGS_URL not in metadata From 
f1cabc726cf307b1d346884973fcb910ba7085e4 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 21:45:04 +0100 Subject: [PATCH 14/50] Add exception handling for pipeline preparation errors --- .../integrations/aws/orchestrators/sagemaker_orchestrator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 1abae0806d..eb86affa36 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -243,6 +243,7 @@ def prepare_or_run_pipeline( `boto3.Session` object. TypeError: If the network_config passed is not compatible with the AWS SageMaker NetworkConfig class. + Exception: If there is an error during pipeline preparation or execution. Yields: A dictionary of metadata related to the pipeline run. From 726b47aa02f46b991bbe266c0dd0e8cda6e72ca2 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 22:06:00 +0100 Subject: [PATCH 15/50] Add timezone information to first execution message --- .../integrations/aws/orchestrators/sagemaker_orchestrator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index eb86affa36..69a8b4a830 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -614,7 +614,8 @@ def prepare_or_run_pipeline( f"Successfully scheduled pipeline with rule: {rule_name}\n" f"Schedule type: {schedule_expr}\n" + ( - f"First execution will occur at: {next_execution.strftime('%Y-%m-%d %H:%M:%S')} UTC" + f"First execution will occur at: {next_execution.strftime('%Y-%m-%d %H:%M:%S')} " + f"({next_execution.astimezone().tzinfo})" if next_execution else f"Using cron 
expression: {deployment.schedule.cron_expression}" ) From 80c8e8e422e5e2aa7573cd8f8b898cccccedc750 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 22:11:54 +0100 Subject: [PATCH 16/50] Add timezone support to AWS SageMaker orchestrator --- .../orchestrators/sagemaker_orchestrator.py | 46 +++++++++++++++---- .../test_sagemaker_orchestrator.py | 9 ++-- 2 files changed, 43 insertions(+), 12 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 69a8b4a830..711c7a8156 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -16,7 +16,7 @@ import json import os import re -from datetime import datetime +from datetime import datetime, timezone from typing import ( TYPE_CHECKING, Any, @@ -512,14 +512,28 @@ def prepare_or_run_pipeline( datetime.utcnow() + deployment.schedule.interval_second ) elif deployment.schedule.run_once_start_time: - # Format for specific date/time: cron(Minutes Hours Day-of-month Month ? Year) - # Example: cron(0 12 1 1 ? 2024) - dt = deployment.schedule.run_once_start_time + # Convert local time to UTC for EventBridge + dt = deployment.schedule.run_once_start_time.astimezone( + timezone.utc + ) schedule_expr = f"cron({dt.minute} {dt.hour} {dt.day} {dt.month} ? 
{dt.year})" next_execution = deployment.schedule.run_once_start_time logger.info( - f"Creating EventBridge rule with schedule expression: {schedule_expr}" + f"Creating EventBridge rule with schedule expression: {schedule_expr}\n" + f"Note: AWS EventBridge schedules are always executed in UTC timezone.\n" + + ( + f"First execution will occur at: {next_execution.strftime('%Y-%m-%d %H:%M:%S')} " + f"({next_execution.astimezone().tzinfo}) / " + f"{next_execution.astimezone(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} (UTC)" + if next_execution + else f"Using cron expression: {deployment.schedule.cron_expression}" + ) + + ( + f" (and every {int(minutes)} minutes after)" + if deployment.schedule.interval_second + else "" + ) ) # Create IAM policy for EventBridge @@ -582,14 +596,28 @@ def prepare_or_run_pipeline( datetime.utcnow() + deployment.schedule.interval_second ) elif deployment.schedule.run_once_start_time: - # Format for specific date/time: cron(Minutes Hours Day-of-month Month ? Year) - # Example: cron(0 12 1 1 ? 2024) - dt = deployment.schedule.run_once_start_time + # Convert local time to UTC for EventBridge + dt = deployment.schedule.run_once_start_time.astimezone( + timezone.utc + ) schedule_expr = f"cron({dt.minute} {dt.hour} {dt.day} {dt.month} ? 
{dt.year})" next_execution = deployment.schedule.run_once_start_time logger.info( - f"Creating EventBridge rule with schedule expression: {schedule_expr}" + f"Creating EventBridge rule with schedule expression: {schedule_expr}\n" + f"Note: AWS EventBridge schedules are always executed in UTC timezone.\n" + + ( + f"First execution will occur at: {next_execution.strftime('%Y-%m-%d %H:%M:%S')} " + f"({next_execution.astimezone().tzinfo}) / " + f"{next_execution.astimezone(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} (UTC)" + if next_execution + else f"Using cron expression: {deployment.schedule.cron_expression}" + ) + + ( + f" (and every {int(minutes)} minutes after)" + if deployment.schedule.interval_second + else "" + ) ) events_client.put_rule( diff --git a/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py b/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py index 8a92768cba..5cc06497a2 100644 --- a/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py +++ b/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py @@ -13,7 +13,7 @@ # permissions and limitations under the License. 
-from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from unittest.mock import MagicMock, patch from zenml.constants import ( @@ -55,8 +55,8 @@ def test_compute_schedule_metadata(): ) settings = SagemakerOrchestratorSettings() - # Mock schedule info - next_execution = datetime.utcnow() + timedelta(hours=1) + # Mock schedule info with timezone-aware datetime in UTC + next_execution = datetime.now(timezone.utc) + timedelta(hours=1) schedule_info = { "rule_name": "test-rule", "schedule_type": "rate", @@ -91,6 +91,9 @@ def test_compute_schedule_metadata(): assert metadata["pipeline_name"] == "test-pipeline" assert metadata["next_execution_time"] == next_execution.isoformat() + # Verify that boto3 Session was created with correct region + mock_session.assert_called_once_with(region_name="us-west-2") + # Verify orchestrator metadata assert metadata[METADATA_ORCHESTRATOR_URL] == ( "https://studio-d-test123.studio.us-west-2.sagemaker.aws/pipelines/view/test-pipeline" From da8fd35018f8607a8110ef769777183e9507e67b Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 22:19:31 +0100 Subject: [PATCH 17/50] Update error handling in SagemakerOrchestrator --- .../aws/orchestrators/sagemaker_orchestrator.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 711c7a8156..11ee935313 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -31,7 +31,7 @@ import boto3 import sagemaker -from botocore.exceptions import WaiterError +from botocore.exceptions import BotoCoreError, ClientError, WaiterError from sagemaker.network import NetworkConfig from sagemaker.processing import ProcessingInput, ProcessingOutput from sagemaker.workflow.execution_variables import 
ExecutionVariables @@ -243,7 +243,9 @@ def prepare_or_run_pipeline( `boto3.Session` object. TypeError: If the network_config passed is not compatible with the AWS SageMaker NetworkConfig class. - Exception: If there is an error during pipeline preparation or execution. + KeyError: If required fields are missing from schedule_info. + ClientError: If there's an AWS API error + BotoCoreError: If there's an error in the AWS SDK Yields: A dictionary of metadata related to the pipeline run. @@ -564,9 +566,15 @@ def prepare_or_run_pipeline( ) logger.info(f"Created/Updated IAM policy: {policy_name}") - except Exception as e: + except (ClientError, BotoCoreError) as e: logger.error( - f"Failed to update IAM policy or trust relationship: {e}" + f"Failed to update IAM policy: {e}. " + f"Please ensure you have sufficient IAM permissions." + ) + raise + except KeyError as e: + logger.error( + f"Missing required field for IAM policy creation: {e}" ) raise From ab0c06dcf3fd1d8ec7e924d7c7672e2a37b15e76 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 22:21:42 +0100 Subject: [PATCH 18/50] Update error handling messages for AWS in Sagemaker orchestrator --- .../integrations/aws/orchestrators/sagemaker_orchestrator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 11ee935313..29e0389494 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -244,8 +244,8 @@ def prepare_or_run_pipeline( TypeError: If the network_config passed is not compatible with the AWS SageMaker NetworkConfig class. KeyError: If required fields are missing from schedule_info. - ClientError: If there's an AWS API error - BotoCoreError: If there's an error in the AWS SDK + ClientError: If there's an AWS API error. 
+ BotoCoreError: If there's an error in the AWS SDK. Yields: A dictionary of metadata related to the pipeline run. From 3c12d4594d0d853e046bf714bfccbb1a462e93c6 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 22:37:35 +0100 Subject: [PATCH 19/50] Refactor error handling in SagemakerOrchestrator --- .../aws/orchestrators/sagemaker_orchestrator.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 29e0389494..367d2b060a 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -243,9 +243,6 @@ def prepare_or_run_pipeline( `boto3.Session` object. TypeError: If the network_config passed is not compatible with the AWS SageMaker NetworkConfig class. - KeyError: If required fields are missing from schedule_info. - ClientError: If there's an AWS API error. - BotoCoreError: If there's an error in the AWS SDK. Yields: A dictionary of metadata related to the pipeline run. @@ -567,16 +564,17 @@ def prepare_or_run_pipeline( logger.info(f"Created/Updated IAM policy: {policy_name}") except (ClientError, BotoCoreError) as e: - logger.error( + logger.warning( f"Failed to update IAM policy: {e}. " - f"Please ensure you have sufficient IAM permissions." + f"Please ensure your execution role has sufficient permissions " + f"to start pipeline executions." ) - raise except KeyError as e: - logger.error( - f"Missing required field for IAM policy creation: {e}" + logger.warning( + f"Missing required field for IAM policy creation: {e}. " + f"Please ensure your execution role has sufficient permissions " + f"to start pipeline executions." 
) - raise # Create the EventBridge rule events_client = session.boto_session.client("events") From 51a499a884f4dc3467d5f825bfffc1376ee89ece Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 22:43:11 +0100 Subject: [PATCH 20/50] Handle insufficient permissions creating EventBridge rules --- .../orchestrators/sagemaker_orchestrator.py | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 367d2b060a..5c9b0f54a5 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -240,7 +240,8 @@ def prepare_or_run_pipeline( Raises: RuntimeError: If a connector is used that does not return a - `boto3.Session` object. + `boto3.Session` object, or if there are insufficient permissions + to create EventBridge rules. TypeError: If the network_config passed is not compatible with the AWS SageMaker NetworkConfig class. 
@@ -626,23 +627,28 @@ def prepare_or_run_pipeline( ) ) - events_client.put_rule( - Name=rule_name, - ScheduleExpression=schedule_expr, - State="ENABLED", - ) - - # Add the SageMaker pipeline as target with the role - events_client.put_targets( - Rule=rule_name, - Targets=[ - { - "Id": f"zenml-target-{deployment.pipeline_configuration.name}", - "Arn": f"arn:aws:sagemaker:{session.boto_region_name}:{session.boto_session.client('sts').get_caller_identity()['Account']}:pipeline/{orchestrator_run_name}", - "RoleArn": self.config.execution_role, - } - ], - ) + try: + events_client.put_rule( + Name=rule_name, + ScheduleExpression=schedule_expr, + State="ENABLED", + ) + # Add the SageMaker pipeline as target with the role + events_client.put_targets( + Rule=rule_name, + Targets=[ + { + "Id": f"zenml-target-{deployment.pipeline_configuration.name}", + "Arn": f"arn:aws:sagemaker:{session.boto_region_name}:{session.boto_session.client('sts').get_caller_identity()['Account']}:pipeline/{orchestrator_run_name}", + "RoleArn": self.config.execution_role, + } + ], + ) + except (ClientError, BotoCoreError) as e: + raise RuntimeError( + f"Failed to create EventBridge target. 
Please ensure you have " + f"sufficient permissions to create and manage EventBridge targets: {str(e)}" + ) from e logger.info( f"Successfully scheduled pipeline with rule: {rule_name}\n" From 7d65a20e0fc8e7d3f3ea791d77084e74a5dda19c Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 22:44:09 +0100 Subject: [PATCH 21/50] Update error message for EventBridge creation failure --- .../integrations/aws/orchestrators/sagemaker_orchestrator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 5c9b0f54a5..6e82623382 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -646,8 +646,8 @@ def prepare_or_run_pipeline( ) except (ClientError, BotoCoreError) as e: raise RuntimeError( - f"Failed to create EventBridge target. Please ensure you have " - f"sufficient permissions to create and manage EventBridge targets: {str(e)}" + f"Failed to create EventBridge rule or target. 
Please ensure you have " + f"sufficient permissions to create and manage EventBridge rules and targets: {str(e)}" ) from e logger.info( From e2ecaf3dbd186d2a360a5fd885d213beb96ab8c2 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 22:51:10 +0100 Subject: [PATCH 22/50] Remove logging in SagemakerOrchestrator class --- .../aws/orchestrators/sagemaker_orchestrator.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 6e82623382..7413902984 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -519,23 +519,6 @@ def prepare_or_run_pipeline( schedule_expr = f"cron({dt.minute} {dt.hour} {dt.day} {dt.month} ? {dt.year})" next_execution = deployment.schedule.run_once_start_time - logger.info( - f"Creating EventBridge rule with schedule expression: {schedule_expr}\n" - f"Note: AWS EventBridge schedules are always executed in UTC timezone.\n" - + ( - f"First execution will occur at: {next_execution.strftime('%Y-%m-%d %H:%M:%S')} " - f"({next_execution.astimezone().tzinfo}) / " - f"{next_execution.astimezone(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} (UTC)" - if next_execution - else f"Using cron expression: {deployment.schedule.cron_expression}" - ) - + ( - f" (and every {int(minutes)} minutes after)" - if deployment.schedule.interval_second - else "" - ) - ) - # Create IAM policy for EventBridge iam_client = session.boto_session.client("iam") role_name = self.config.execution_role.split("/")[ From a5fd82bac4a73b352c75845ef89e6acfb3c7ee0d Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Fri, 20 Dec 2024 23:02:53 +0100 Subject: [PATCH 23/50] Refactor orchestrator metadata computation logic --- .../orchestrators/sagemaker_orchestrator.py | 71 ------------------- .../test_sagemaker_orchestrator.py | 21 
------ 2 files changed, 92 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 7413902984..f751db42db 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -809,15 +809,6 @@ def compute_metadata( if next_execution := execution.get("next_execution"): metadata["next_execution_time"] = next_execution.isoformat() - - # Add orchestrator metadata using the same pattern as execution metadata - if orchestrator_url := self._compute_schedule_url(execution): - metadata[METADATA_ORCHESTRATOR_URL] = Uri(orchestrator_url) - - if logs_url := self._compute_schedule_logs_url( - execution, settings - ): - metadata[METADATA_ORCHESTRATOR_LOGS_URL] = Uri(logs_url) else: # Handle execution metadata if run_id := self._compute_orchestrator_run_id(execution): @@ -926,65 +917,3 @@ def _compute_orchestrator_run_id( f"There was an issue while extracting the pipeline run ID: {e}" ) return None - - @staticmethod - def _compute_schedule_url(schedule_info: Dict[str, Any]) -> Optional[str]: - """Generate the SageMaker Console URL for a scheduled pipeline. - - Args: - schedule_info: Dictionary containing schedule information. - - Returns: - The URL to the pipeline in the SageMaker console. - """ - try: - # Get the Sagemaker session - session = boto3.Session(region_name=schedule_info["region"]) - sagemaker_client = session.client("sagemaker") - - # List the Studio domains and get the Studio Domain ID - domains_response = sagemaker_client.list_domains() - studio_domain_id = domains_response["Domains"][0]["DomainId"] - - return ( - f"https://studio-{studio_domain_id}.studio.{schedule_info['region']}." 
- f"sagemaker.aws/pipelines/view/{schedule_info['pipeline_name']}" - ) - except Exception as e: - logger.warning( - f"There was an issue while extracting the pipeline url: {e}" - ) - return None - - @staticmethod - def _compute_schedule_logs_url( - schedule_info: Dict[str, Any], - settings: SagemakerOrchestratorSettings, - ) -> Optional[str]: - """Generate the CloudWatch URL for a scheduled pipeline. - - Args: - schedule_info: Dictionary containing schedule information. - settings: The Sagemaker orchestrator settings. - - Returns: - The URL to query the pipeline logs in CloudWatch. - """ - try: - use_training_jobs = True - if settings.use_training_step is not None: - use_training_jobs = settings.use_training_step - - job_type = "Training" if use_training_jobs else "Processing" - - return ( - f"https://{schedule_info['region']}.console.aws.amazon.com/" - f"cloudwatch/home?region={schedule_info['region']}#logsV2:" - f"log-groups/log-group/$252Faws$252Fsagemaker$252F{job_type}Jobs" - f"$3FlogStreamNameFilter$3Dpipelines-" - ) - except Exception as e: - logger.warning( - f"There was an issue while extracting the logs url: {e}" - ) - return None diff --git a/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py b/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py index 5cc06497a2..9d4299b9fc 100644 --- a/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py +++ b/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py @@ -16,11 +16,6 @@ from datetime import datetime, timedelta, timezone from unittest.mock import MagicMock, patch -from zenml.constants import ( - METADATA_ORCHESTRATOR_LOGS_URL, - METADATA_ORCHESTRATOR_RUN_ID, - METADATA_ORCHESTRATOR_URL, -) from zenml.enums import StackComponentType from zenml.integrations.aws.flavors import SagemakerOrchestratorFlavor from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import ( @@ -91,17 +86,6 @@ def 
test_compute_schedule_metadata(): assert metadata["pipeline_name"] == "test-pipeline" assert metadata["next_execution_time"] == next_execution.isoformat() - # Verify that boto3 Session was created with correct region - mock_session.assert_called_once_with(region_name="us-west-2") - - # Verify orchestrator metadata - assert metadata[METADATA_ORCHESTRATOR_URL] == ( - "https://studio-d-test123.studio.us-west-2.sagemaker.aws/pipelines/view/test-pipeline" - ) - assert metadata[METADATA_ORCHESTRATOR_LOGS_URL].startswith( - "https://us-west-2.console.aws.amazon.com/cloudwatch/home" - ) - def test_compute_schedule_metadata_error_handling(): """Tests error handling in schedule metadata computation.""" @@ -142,8 +126,3 @@ def test_compute_schedule_metadata_error_handling(): assert metadata["schedule_type"] == "rate" assert metadata["schedule_expression"] == "rate(1 hour)" assert metadata["pipeline_name"] == "test-pipeline" - - # URLs should be None due to error - assert METADATA_ORCHESTRATOR_RUN_ID not in metadata - assert METADATA_ORCHESTRATOR_URL not in metadata - assert METADATA_ORCHESTRATOR_LOGS_URL not in metadata From a005e5a2c862dd1c5b4f1d352eda0dc81ccb8427 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sun, 22 Dec 2024 20:41:38 +0100 Subject: [PATCH 24/50] Update handling of scheduled pipeline updates in SageMaker.md --- docs/book/component-guide/orchestrators/sagemaker.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/book/component-guide/orchestrators/sagemaker.md b/docs/book/component-guide/orchestrators/sagemaker.md index 2da138af2d..0602166b8d 100644 --- a/docs/book/component-guide/orchestrators/sagemaker.md +++ b/docs/book/component-guide/orchestrators/sagemaker.md @@ -371,6 +371,10 @@ When you deploy a scheduled pipeline, ZenML will: 2. Configure the necessary IAM permissions 3. 
Set up the SageMaker pipeline as the target +{% hint style="info" %} +If you run the same pipeline with a schedule multiple times, the existing schedule will be updated with the new settings rather than creating a new schedule. This allows you to modify schedules by simply running the pipeline again with new schedule parameters. +{% endhint %} + #### Required IAM Permissions When using scheduled pipelines, you need to ensure your IAM role has the correct permissions and trust relationships. Here's a detailed breakdown of why each permission is needed: From 8811f3f7abb37a8c3637f2dfb99e5f9fcf1b0318 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 23 Dec 2024 08:55:52 +0100 Subject: [PATCH 25/50] Add optional IAM permissions for policy updates --- .../orchestrators/sagemaker.md | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/docs/book/component-guide/orchestrators/sagemaker.md b/docs/book/component-guide/orchestrators/sagemaker.md index 0602166b8d..869c988f76 100644 --- a/docs/book/component-guide/orchestrators/sagemaker.md +++ b/docs/book/component-guide/orchestrators/sagemaker.md @@ -418,14 +418,23 @@ In addition to the basic SageMaker permissions, the AWS credentials used by the "events:ListTargetsByRule" // Required to verify target setup ], "Resource": "arn:aws:events:*:*:rule/zenml-*" - }, + } + ] +} +``` + +The following IAM permissions are optional but recommended to allow automatic policy updates for the execution role: +```json +{ + "Version": "2012-10-17", + "Statement": [ { "Effect": "Allow", "Action": [ - "iam:GetRole", // Required to verify role exists - "iam:GetRolePolicy", // Required to check existing policies - "iam:PutRolePolicy", // Required to add new policies - "iam:UpdateAssumeRolePolicy" // Required to update trust relationships + "iam:GetRole", // For verifying role exists + "iam:GetRolePolicy", // For checking existing policies + "iam:PutRolePolicy", // For adding new policies + 
"iam:UpdateAssumeRolePolicy" // For updating trust relationships ], "Resource": "arn:aws:iam::*:role/*" } @@ -439,6 +448,6 @@ These permissions enable: * Managing IAM policies required for the scheduled execution * Cleanup of resources when schedules are removed -Without these permissions, the scheduling functionality will fail with access denied errors. +Without the EventBridge permissions, the scheduling functionality will fail. Without the IAM permissions, you'll need to manually ensure your execution role has the necessary permissions to start pipeline executions.
ZenML Scarf
\ No newline at end of file From 44fa6c5039dc3b5751a08e781e81605c6b05a631 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 23 Dec 2024 09:16:00 +0100 Subject: [PATCH 26/50] Remove redundant code for getting SageMaker session --- .../integrations/aws/orchestrators/sagemaker_orchestrator.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index f751db42db..72066fae64 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -248,9 +248,6 @@ def prepare_or_run_pipeline( Yields: A dictionary of metadata related to the pipeline run. """ - # Get the session and client - session = self._get_sagemaker_session() - # sagemaker requires pipelineName to use alphanum and hyphens only unsanitized_orchestrator_run_name = get_orchestrator_run_name( pipeline_name=deployment.pipeline_configuration.name From 3146b579792e201d82340ee1581cbeac5582a7ac Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 9 Jan 2025 14:58:51 +0100 Subject: [PATCH 27/50] Add pipeline scheduler role and handling for scheduling errors --- .../flavors/sagemaker_orchestrator_flavor.py | 2 + .../orchestrators/sagemaker_orchestrator.py | 234 ++++++------------ 2 files changed, 81 insertions(+), 155 deletions(-) diff --git a/src/zenml/integrations/aws/flavors/sagemaker_orchestrator_flavor.py b/src/zenml/integrations/aws/flavors/sagemaker_orchestrator_flavor.py index ea17571d77..2e94f53398 100644 --- a/src/zenml/integrations/aws/flavors/sagemaker_orchestrator_flavor.py +++ b/src/zenml/integrations/aws/flavors/sagemaker_orchestrator_flavor.py @@ -193,6 +193,7 @@ class SagemakerOrchestratorConfig( Attributes: execution_role: The IAM role ARN to use for the pipeline. + scheduler_role: The IAM role ARN to use for the scheduler. 
aws_access_key_id: The AWS access key ID to use to authenticate to AWS. If not provided, the value from the default AWS config will be used. aws_secret_access_key: The AWS secret access key to use to authenticate @@ -212,6 +213,7 @@ class SagemakerOrchestratorConfig( """ execution_role: str + scheduler_role: Optional[str] = None aws_access_key_id: Optional[str] = SecretField(default=None) aws_secret_access_key: Optional[str] = SecretField(default=None) aws_profile: Optional[str] = None diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 72066fae64..3cec88e1ba 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -13,7 +13,6 @@ # permissions and limitations under the License. """Implementation of the SageMaker orchestrator.""" -import json import os import re from datetime import datetime, timezone @@ -31,12 +30,13 @@ import boto3 import sagemaker -from botocore.exceptions import BotoCoreError, ClientError, WaiterError +from botocore.exceptions import WaiterError from sagemaker.network import NetworkConfig from sagemaker.processing import ProcessingInput, ProcessingOutput from sagemaker.workflow.execution_variables import ExecutionVariables from sagemaker.workflow.pipeline import Pipeline from sagemaker.workflow.steps import ProcessingStep, TrainingStep +from sagemaker.workflow.triggers import PipelineSchedule from zenml.config.base_settings import BaseSettings from zenml.constants import ( @@ -239,11 +239,10 @@ def prepare_or_run_pipeline( environment. Raises: - RuntimeError: If a connector is used that does not return a - `boto3.Session` object, or if there are insufficient permissions - to create EventBridge rules. + RuntimeError: If there is an error creating or scheduling the pipeline. 
TypeError: If the network_config passed is not compatible with the AWS SageMaker NetworkConfig class. + ValueError: If the schedule is not valid. Yields: A dictionary of metadata related to the pipeline run. @@ -484,92 +483,21 @@ def prepare_or_run_pipeline( "they run independently of the deployment process." ) - events_client = session.boto_session.client("events") - rule_name = f"zenml-{deployment.pipeline_configuration.name}" + schedule_name = f"zenml-{deployment.pipeline_configuration.name}" + next_execution = None - # Determine first execution time based on schedule type + # Create PipelineSchedule based on schedule type if deployment.schedule.cron_expression: - # AWS EventBridge requires cron expressions in format: cron(0 12 * * ? *) # Strip any "cron(" prefix if it exists cron_exp = deployment.schedule.cron_expression.replace( "cron(", "" ).replace(")", "") - schedule_expr = f"cron({cron_exp})" - next_execution = None - elif deployment.schedule.interval_second: - minutes = max( - 1, - int( - deployment.schedule.interval_second.total_seconds() - / 60 - ), - ) - schedule_expr = f"rate({minutes} minutes)" - next_execution = ( - datetime.utcnow() + deployment.schedule.interval_second - ) - elif deployment.schedule.run_once_start_time: - # Convert local time to UTC for EventBridge - dt = deployment.schedule.run_once_start_time.astimezone( - timezone.utc - ) - schedule_expr = f"cron({dt.minute} {dt.hour} {dt.day} {dt.month} ? 
{dt.year})" - next_execution = deployment.schedule.run_once_start_time - - # Create IAM policy for EventBridge - iam_client = session.boto_session.client("iam") - role_name = self.config.execution_role.split("/")[ - -1 - ] # Extract role name from ARN - - # Create the policy document (existing) - policy_document = { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": ["sagemaker:StartPipelineExecution"], - "Resource": f"arn:aws:sagemaker:{session.boto_region_name}:{session.boto_session.client('sts').get_caller_identity()['Account']}:pipeline/{orchestrator_run_name}", - } - ], - } - - try: - # Update the role policy (existing) - policy_name = f"zenml-eventbridge-{orchestrator_run_name}" - iam_client.put_role_policy( - RoleName=role_name, - PolicyName=policy_name, - PolicyDocument=json.dumps(policy_document), - ) - logger.info(f"Created/Updated IAM policy: {policy_name}") - - except (ClientError, BotoCoreError) as e: - logger.warning( - f"Failed to update IAM policy: {e}. " - f"Please ensure your execution role has sufficient permissions " - f"to start pipeline executions." + schedule = PipelineSchedule( + name=schedule_name, + cron=cron_exp, + start_date=deployment.schedule.start_time, + enabled=True, ) - except KeyError as e: - logger.warning( - f"Missing required field for IAM policy creation: {e}. " - f"Please ensure your execution role has sufficient permissions " - f"to start pipeline executions." - ) - - # Create the EventBridge rule - events_client = session.boto_session.client("events") - rule_name = f"zenml-{deployment.pipeline_configuration.name}" - - # Determine first execution time based on schedule type - if deployment.schedule.cron_expression: - # AWS EventBridge requires cron expressions in format: cron(0 12 * * ? 
*) - # Strip any "cron(" prefix if it exists - cron_exp = deployment.schedule.cron_expression.replace( - "cron(", "" - ).replace(")", "") - schedule_expr = f"cron({cron_exp})" - next_execution = None elif deployment.schedule.interval_second: minutes = max( 1, @@ -578,95 +506,91 @@ def prepare_or_run_pipeline( / 60 ), ) - schedule_expr = f"rate({minutes} minutes)" + schedule = PipelineSchedule( + name=schedule_name, + rate=(minutes, "minutes"), + start_date=deployment.schedule.start_time, + enabled=True, + ) next_execution = ( - datetime.utcnow() + deployment.schedule.interval_second + deployment.schedule.start_time or datetime.utcnow() + ) + deployment.schedule.interval_second + else: + # One-time schedule + execution_time = ( + deployment.schedule.run_once_start_time + or deployment.schedule.start_time ) - elif deployment.schedule.run_once_start_time: - # Convert local time to UTC for EventBridge - dt = deployment.schedule.run_once_start_time.astimezone( - timezone.utc + if not execution_time: + raise ValueError( + "A start time must be specified for one-time schedule execution" + ) + schedule = PipelineSchedule( + name=schedule_name, + at=execution_time.astimezone(timezone.utc), + enabled=True, ) - schedule_expr = f"cron({dt.minute} {dt.hour} {dt.day} {dt.month} ? 
{dt.year})" - next_execution = deployment.schedule.run_once_start_time + next_execution = execution_time - logger.info( - f"Creating EventBridge rule with schedule expression: {schedule_expr}\n" - f"Note: AWS EventBridge schedules are always executed in UTC timezone.\n" - + ( - f"First execution will occur at: {next_execution.strftime('%Y-%m-%d %H:%M:%S')} " - f"({next_execution.astimezone().tzinfo}) / " - f"{next_execution.astimezone(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} (UTC)" - if next_execution - else f"Using cron expression: {deployment.schedule.cron_expression}" - ) - + ( - f" (and every {int(minutes)} minutes after)" - if deployment.schedule.interval_second - else "" + # Get the current role ARN if not explicitly configured + if self.config.scheduler_role is None: + logger.info( + "No scheduler_role configured. Using service connector role to schedule pipeline." ) - ) + sts = session.boto_session.client("sts") + try: + service_connector_role_arn = sts.get_caller_identity()[ + "Arn" + ] + # If this is a user ARN, try to get the role ARN + if ":user/" in service_connector_role_arn: + logger.warning( + f"Using IAM user credentials ({service_connector_role_arn}). For production " + "environments, it's recommended to use IAM roles instead." + ) + # If this is an assumed role, extract the role ARN + elif ":assumed-role/" in service_connector_role_arn: + # Convert assumed-role ARN format to role ARN format + # From: arn:aws:sts::123456789012:assumed-role/role-name/session-name + # To: arn:aws:iam::123456789012:role/role-name + service_connector_role_arn = re.sub( + r"arn:aws:sts::(\d+):assumed-role/([^/]+)/.*", + r"arn:aws:iam::\1:role/\2", + service_connector_role_arn, + ) + except Exception: + raise RuntimeError( + "Failed to get current role ARN from service connector. This means " + "the service connector is not configured correctly to schedule sagemaker " + "pipelines. 
You can either fix the service connector or configure " + "`scheduler_role` explicitly in your orchestrator config." + ) + else: + service_connector_role_arn = self.config.scheduler_role - try: - events_client.put_rule( - Name=rule_name, - ScheduleExpression=schedule_expr, - State="ENABLED", - ) - # Add the SageMaker pipeline as target with the role - events_client.put_targets( - Rule=rule_name, - Targets=[ - { - "Id": f"zenml-target-{deployment.pipeline_configuration.name}", - "Arn": f"arn:aws:sagemaker:{session.boto_region_name}:{session.boto_session.client('sts').get_caller_identity()['Account']}:pipeline/{orchestrator_run_name}", - "RoleArn": self.config.execution_role, - } - ], - ) - except (ClientError, BotoCoreError) as e: - raise RuntimeError( - f"Failed to create EventBridge rule or target. Please ensure you have " - f"sufficient permissions to create and manage EventBridge rules and targets: {str(e)}" - ) from e + # Attach schedule to pipeline + triggers = pipeline.put_triggers( + triggers=[schedule], role_arn=service_connector_role_arn + ) + logger.info(f"The schedule ARN is: {triggers[0]}") logger.info( - f"Successfully scheduled pipeline with rule: {rule_name}\n" - f"Schedule type: {schedule_expr}\n" + f"Successfully scheduled pipeline with name: {schedule_name}\n" + ( - f"First execution will occur at: {next_execution.strftime('%Y-%m-%d %H:%M:%S')} " - f"({next_execution.astimezone().tzinfo})" + f"First execution will occur at: {next_execution.strftime('%Y-%m-%d %H:%M:%S UTC')}" if next_execution else f"Using cron expression: {deployment.schedule.cron_expression}" ) + ( - f" (and every {int(minutes)} minutes after)" + f" (and every {minutes} minutes after)" if deployment.schedule.interval_second else "" ) ) - - # Yield metadata about the schedule - schedule_type = ( - "cron" - if deployment.schedule.cron_expression - else "rate" - if deployment.schedule.interval_second - else "one-time" - ) - - schedule_metadata = { - "rule_name": rule_name, - 
"schedule_type": schedule_type, - "schedule_expr": schedule_expr, - "pipeline_name": orchestrator_run_name, - "next_execution": next_execution, - } - - yield from self.compute_metadata( - execution=schedule_metadata, - settings=settings, + logger.info( + "\n\nIn order to cancel the schedule, you can use execute the following command:\n" ) + logger.info(f"`aws events disable-rule --name {schedule_name}`") else: # Execute the pipeline immediately if no schedule is specified execution = pipeline.start() From e52109beea6cc2042a3def1b11113a83804e3617 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 9 Jan 2025 15:00:26 +0100 Subject: [PATCH 28/50] Refactor Sagemaker orchestrator test methods --- .../test_sagemaker_orchestrator.py | 104 ------------------ 1 file changed, 104 deletions(-) diff --git a/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py b/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py index 9d4299b9fc..523e705980 100644 --- a/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py +++ b/tests/integration/integrations/aws/orchestrators/test_sagemaker_orchestrator.py @@ -12,18 +12,8 @@ # or implied. See the License for the specific language governing # permissions and limitations under the License. 
- -from datetime import datetime, timedelta, timezone -from unittest.mock import MagicMock, patch - from zenml.enums import StackComponentType from zenml.integrations.aws.flavors import SagemakerOrchestratorFlavor -from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import ( - SagemakerOrchestratorSettings, -) -from zenml.integrations.aws.orchestrators.sagemaker_orchestrator import ( - SagemakerOrchestrator, -) def test_sagemaker_orchestrator_flavor_attributes(): @@ -32,97 +22,3 @@ def test_sagemaker_orchestrator_flavor_attributes(): flavor = SagemakerOrchestratorFlavor() assert flavor.type == StackComponentType.ORCHESTRATOR assert flavor.name == "sagemaker" - - -def test_compute_schedule_metadata(): - """Tests that schedule metadata is computed correctly.""" - # Setup - orchestrator = SagemakerOrchestrator( - name="test_orchestrator", - id="test-id", - config={}, - flavor="sagemaker", - type="orchestrator", - user="test-user", - workspace="test-workspace", - created="2023-01-01", - updated="2023-01-01", - ) - settings = SagemakerOrchestratorSettings() - - # Mock schedule info with timezone-aware datetime in UTC - next_execution = datetime.now(timezone.utc) + timedelta(hours=1) - schedule_info = { - "rule_name": "test-rule", - "schedule_type": "rate", - "schedule_expr": "rate(1 hour)", - "pipeline_name": "test-pipeline", - "next_execution": next_execution, - "region": "us-west-2", - "account_id": "123456789012", - } - - # Mock boto3 session and SageMaker client - mock_sagemaker_client = MagicMock() - mock_sagemaker_client.list_domains.return_value = { - "Domains": [{"DomainId": "d-test123"}] - } - - with patch("boto3.Session") as mock_session: - mock_session.return_value.client.return_value = mock_sagemaker_client - - # Get metadata - metadata = next( - orchestrator.compute_metadata( - execution=schedule_info, - settings=settings, - ) - ) - - # Verify schedule-specific metadata - assert metadata["schedule_rule_name"] == "test-rule" - assert 
metadata["schedule_type"] == "rate" - assert metadata["schedule_expression"] == "rate(1 hour)" - assert metadata["pipeline_name"] == "test-pipeline" - assert metadata["next_execution_time"] == next_execution.isoformat() - - -def test_compute_schedule_metadata_error_handling(): - """Tests error handling in schedule metadata computation.""" - orchestrator = SagemakerOrchestrator( - name="test_orchestrator", - id="test-id", - config={}, - flavor="sagemaker", - type="orchestrator", - user="test-user", - workspace="test-workspace", - created="2023-01-01", - updated="2023-01-01", - ) - settings = SagemakerOrchestratorSettings() - - # Invalid schedule info missing required fields - schedule_info = { - "rule_name": "test-rule", - "schedule_type": "rate", # Add minimum required fields - "schedule_expr": "rate(1 hour)", - "pipeline_name": "test-pipeline", - } - - with patch("boto3.Session") as mock_session: - mock_session.side_effect = Exception("Failed to create session") - - # Get metadata - should not raise exception - metadata = next( - orchestrator.compute_metadata( - execution=schedule_info, - settings=settings, - ) - ) - - # Basic metadata should still be present - assert metadata["schedule_rule_name"] == "test-rule" - assert metadata["schedule_type"] == "rate" - assert metadata["schedule_expression"] == "rate(1 hour)" - assert metadata["pipeline_name"] == "test-pipeline" From 0272c5e636b5568bdf9e0ee9117bd89c9133430b Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 9 Jan 2025 15:39:16 +0100 Subject: [PATCH 29/50] Update SageMaker orchestrator for scheduled pipelines --- .../orchestrators/sagemaker.md | 91 +++++++++++-------- 1 file changed, 55 insertions(+), 36 deletions(-) diff --git a/docs/book/component-guide/orchestrators/sagemaker.md b/docs/book/component-guide/orchestrators/sagemaker.md index 869c988f76..f8ab791d36 100644 --- a/docs/book/component-guide/orchestrators/sagemaker.md +++ b/docs/book/component-guide/orchestrators/sagemaker.md @@ -337,7 +337,7 @@ 
Note that if you wish to use this orchestrator to run steps on a GPU, you will n ### Scheduling Pipelines -The SageMaker orchestrator supports running pipelines on a schedule using AWS EventBridge. You can configure schedules in three ways: +The SageMaker orchestrator supports running pipelines on a schedule using SageMaker's native scheduling capabilities. You can configure schedules in three ways: * Using a cron expression * Using a fixed interval @@ -367,20 +367,20 @@ def my_one_time_pipeline(): ``` When you deploy a scheduled pipeline, ZenML will: -1. Create an EventBridge rule with the specified schedule -2. Configure the necessary IAM permissions -3. Set up the SageMaker pipeline as the target +1. Create a SageMaker Pipeline Schedule with the specified configuration +2. Configure the pipeline as the target for the schedule +3. Enable automatic execution based on the schedule {% hint style="info" %} -If you run the same pipeline with a schedule multiple times, the existing schedule will be updated with the new settings rather than creating a new schedule. This allows you to modify schedules by simply running the pipeline again with new schedule parameters. +If you run the same pipeline with a schedule multiple times, the existing schedule will **not** be updated with the new settings. Rather, ZenML will create a new sagemaker pipeline and attach a new schedule to it. The user must manually delete the old pipeline and their attached schedule using the AWS CLI or API. See details here: [SageMaker Pipeline Schedules](https://docs.aws.amazon.com/sagemaker/latest/dg/pipeline-eventbridge.html) {% endhint %} #### Required IAM Permissions -When using scheduled pipelines, you need to ensure your IAM role has the correct permissions and trust relationships. 
Here's a detailed breakdown of why each permission is needed: +When using scheduled pipelines, you need to ensure your IAM role (either the service connector role or the configured `scheduler_role`) has the correct permissions and trust relationships: 1. **Trust Relationships** -Your execution role needs to trust both SageMaker and EventBridge services to allow them to assume the role: +Your service connector role needs to trust both SageMaker and EventBridge Scheduler services: ```json { @@ -389,9 +389,11 @@ Your execution role needs to trust both SageMaker and EventBridge services to al { "Effect": "Allow", "Principal": { + "AWS": "", ## This is the ARN of the user that is configured in the service connector + # This is the list of services that the service connector role needs to schedule pipelines "Service": [ - "sagemaker.amazonaws.com", // Required for SageMaker execution - "events.amazonaws.com" // Required for EventBridge to trigger pipelines + "sagemaker.amazonaws.com", + "scheduler.amazonaws.com" ] }, "Action": "sts:AssumeRole" @@ -401,7 +403,8 @@ Your execution role needs to trust both SageMaker and EventBridge services to al ``` 2. 
**Required IAM Policies** -In addition to the basic SageMaker permissions, the AWS credentials used by the service connector (or provided directly to the orchestrator) need the following permissions to create and manage scheduled pipelines: + +The scheduler role (see below) needs the following permissions to manage scheduled pipelines: ```json { @@ -410,44 +413,60 @@ In addition to the basic SageMaker permissions, the AWS credentials used by the { "Effect": "Allow", "Action": [ - "events:PutRule", // Required to create schedule rules - "events:PutTargets", // Required to set pipeline as target - "events:DeleteRule", // Required for cleanup - "events:RemoveTargets", // Required for cleanup - "events:DescribeRule", // Required to verify rule creation - "events:ListTargetsByRule" // Required to verify target setup + "scheduler:ListSchedules", + "scheduler:GetSchedule", + "scheduler:CreateSchedule", + "scheduler:UpdateSchedule", + "scheduler:DeleteSchedule" ], - "Resource": "arn:aws:events:*:*:rule/zenml-*" - } - ] -} -``` - -The following IAM permissions are optional but recommended to allow automatic policy updates for the execution role: -```json -{ - "Version": "2012-10-17", - "Statement": [ + "Resource": "*" + }, { "Effect": "Allow", - "Action": [ - "iam:GetRole", // For verifying role exists - "iam:GetRolePolicy", // For checking existing policies - "iam:PutRolePolicy", // For adding new policies - "iam:UpdateAssumeRolePolicy" // For updating trust relationships - ], - "Resource": "arn:aws:iam::*:role/*" + "Action": "iam:PassRole", + "Resource": "arn:aws:iam::*:role/*", + "Condition": { + "StringLike": { + "iam:PassedToService": "scheduler.amazonaws.com" + } + } } ] } ``` +Or you can use the `AmazonEventBridgeSchedulerFullAccess` managed policy. 
+ These permissions enable: -* Creation and management of EventBridge rules for scheduling +* Creation and management of Pipeline Schedules * Setting up trust relationships between services * Managing IAM policies required for the scheduled execution * Cleanup of resources when schedules are removed -Without the EventBridge permissions, the scheduling functionality will fail. Without the IAM permissions, you'll need to manually ensure your execution role has the necessary permissions to start pipeline executions. +Without these permissions, the scheduling functionality will fail. Make sure to configure them before attempting to use scheduled pipelines. + +By default, the SageMaker orchestrator will use the attached [service connector role](../../how-to/infrastructure-deployment/auth-management/aws-service-connector.md) to schedule pipelines. However, you can specify a different role to be used for scheduling by configuring the `scheduler_role` parameter: + +```python +# When registering the orchestrator +zenml orchestrator register sagemaker-orchestrator \ + --flavor=sagemaker \ + --scheduler_role=arn:aws:iam::123456789012:role/my-scheduler-role + +# Or updating an existing orchestrator +zenml orchestrator update sagemaker-orchestrator \ + --scheduler_role=arn:aws:iam::123456789012:role/my-scheduler-role +``` + +This is particularly useful when: +* You want to use different roles for creating pipelines and scheduling them +* Your organization's security policies require separate roles for different operations +* You need to grant specific permissions only to the scheduling operations + +If no `scheduler_role` is configured, the orchestrator will: +1. Use the service connector's role +2. Log an informative message about using the service connector role +3. Handle both user credentials and assumed role scenarios appropriately +
ZenML Scarf
\ No newline at end of file From 68266132e4112e92dcdaea513a67bcf7448c8c70 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Thu, 9 Jan 2025 14:41:42 +0000 Subject: [PATCH 30/50] Auto-update of LLM Finetuning template --- examples/llm_finetuning/steps/promote.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/llm_finetuning/steps/promote.py b/examples/llm_finetuning/steps/promote.py index 9d5b2e40db..30333e0ad8 100644 --- a/examples/llm_finetuning/steps/promote.py +++ b/examples/llm_finetuning/steps/promote.py @@ -54,11 +54,11 @@ def promote( msg = ( f"`{metric}` values to compare:\n" - f"base={base_metrics[metric]*100:.2f}%\n" - f"finetuned={ft_metrics[metric]*100:.2f}%" + f"base={base_metrics[metric] * 100:.2f}%\n" + f"finetuned={ft_metrics[metric] * 100:.2f}%" ) if staging_metrics: - msg += f"\nstaging={staging_metrics[metric]*100:.2f}%" + msg += f"\nstaging={staging_metrics[metric] * 100:.2f}%" logger.info(msg) if base_metrics[metric] <= ft_metrics[metric]: From dcfb03e8584f44637ed6afe04cda40536ecda9b4 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Thu, 9 Jan 2025 14:42:13 +0000 Subject: [PATCH 31/50] Auto-update of Starter template --- examples/mlops_starter/steps/model_promoter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mlops_starter/steps/model_promoter.py b/examples/mlops_starter/steps/model_promoter.py index ca73c472f4..0c570488d2 100644 --- a/examples/mlops_starter/steps/model_promoter.py +++ b/examples/mlops_starter/steps/model_promoter.py @@ -44,7 +44,7 @@ def model_promoter(accuracy: float, stage: str = "production") -> bool: if accuracy < 0.8: logger.info( - f"Model accuracy {accuracy*100:.2f}% is below 80% ! Not promoting model." + f"Model accuracy {accuracy * 100:.2f}% is below 80% ! Not promoting model." 
) else: logger.info(f"Model promoted to {stage}!") From 87ef96b343b3fcb0c37a6d40e8755f2c24781069 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 9 Jan 2025 15:45:56 +0100 Subject: [PATCH 32/50] Add comment about rounding up to 1 minute for SageMaker --- .../integrations/aws/orchestrators/sagemaker_orchestrator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 3cec88e1ba..49826c91ad 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -499,6 +499,10 @@ def prepare_or_run_pipeline( enabled=True, ) elif deployment.schedule.interval_second: + # This is necessary because SageMaker's PipelineSchedule rate expressions + # require minutes as the minimum time unit. + # Even if a user specifies an interval of less than 60 seconds, + # it will be rounded up to 1 minute. 
minutes = max( 1, int( From eb96d953f07ecbf0f4c78c78c569d0d012a7f9d0 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Thu, 9 Jan 2025 14:49:14 +0000 Subject: [PATCH 33/50] Auto-update of E2E template --- examples/e2e/steps/deployment/deployment_deploy.py | 10 ++++------ examples/e2e/steps/training/model_evaluator.py | 8 ++++---- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/examples/e2e/steps/deployment/deployment_deploy.py b/examples/e2e/steps/deployment/deployment_deploy.py index 2895d35fcb..07395edd5d 100644 --- a/examples/e2e/steps/deployment/deployment_deploy.py +++ b/examples/e2e/steps/deployment/deployment_deploy.py @@ -34,12 +34,10 @@ @step -def deployment_deploy() -> ( - Annotated[ - Optional[MLFlowDeploymentService], - ArtifactConfig(name="mlflow_deployment", is_deployment_artifact=True), - ] -): +def deployment_deploy() -> Annotated[ + Optional[MLFlowDeploymentService], + ArtifactConfig(name="mlflow_deployment", is_deployment_artifact=True), +]: """Predictions step. This is an example of a predictions step that takes the data in and returns diff --git a/examples/e2e/steps/training/model_evaluator.py b/examples/e2e/steps/training/model_evaluator.py index 64fac77295..60f4ded534 100644 --- a/examples/e2e/steps/training/model_evaluator.py +++ b/examples/e2e/steps/training/model_evaluator.py @@ -82,22 +82,22 @@ def model_evaluator( dataset_trn.drop(columns=[target]), dataset_trn[target], ) - logger.info(f"Train accuracy={trn_acc*100:.2f}%") + logger.info(f"Train accuracy={trn_acc * 100:.2f}%") tst_acc = model.score( dataset_tst.drop(columns=[target]), dataset_tst[target], ) - logger.info(f"Test accuracy={tst_acc*100:.2f}%") + logger.info(f"Test accuracy={tst_acc * 100:.2f}%") mlflow.log_metric("testing_accuracy_score", tst_acc) messages = [] if trn_acc < min_train_accuracy: messages.append( - f"Train accuracy {trn_acc*100:.2f}% is below {min_train_accuracy*100:.2f}% !" 
+ f"Train accuracy {trn_acc * 100:.2f}% is below {min_train_accuracy * 100:.2f}% !" ) if tst_acc < min_test_accuracy: messages.append( - f"Test accuracy {tst_acc*100:.2f}% is below {min_test_accuracy*100:.2f}% !" + f"Test accuracy {tst_acc * 100:.2f}% is below {min_test_accuracy * 100:.2f}% !" ) if fail_on_accuracy_quality_gates and messages: raise RuntimeError( From 9542ed8bb433bce7baae6e482230a61a7c5cee42 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Thu, 9 Jan 2025 14:52:15 +0000 Subject: [PATCH 34/50] Auto-update of NLP template --- .../e2e_nlp/steps/deploying/huggingface_deployment.py | 4 +++- .../e2e_nlp/steps/promotion/promote_get_metrics.py | 10 ++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/e2e_nlp/steps/deploying/huggingface_deployment.py b/examples/e2e_nlp/steps/deploying/huggingface_deployment.py index 222e813a3a..7c743ce732 100644 --- a/examples/e2e_nlp/steps/deploying/huggingface_deployment.py +++ b/examples/e2e_nlp/steps/deploying/huggingface_deployment.py @@ -39,7 +39,9 @@ def deploy_to_huggingface( """ ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### secret = Client().get_secret("huggingface_creds") - assert secret, "No secret found with name 'huggingface_creds'. Please create one that includes your `username` and `token`." + assert secret, ( + "No secret found with name 'huggingface_creds'. Please create one that includes your `username` and `token`." 
+ ) token = secret.secret_values["token"] api = HfApi(token=token) hf_repo = api.create_repo( diff --git a/examples/e2e_nlp/steps/promotion/promote_get_metrics.py b/examples/e2e_nlp/steps/promotion/promote_get_metrics.py index 8c8220bc71..eec6eb526f 100644 --- a/examples/e2e_nlp/steps/promotion/promote_get_metrics.py +++ b/examples/e2e_nlp/steps/promotion/promote_get_metrics.py @@ -30,12 +30,10 @@ @step -def promote_get_metrics() -> ( - Tuple[ - Annotated[Dict[str, Any], "latest_metrics"], - Annotated[Dict[str, Any], "current_metrics`"], - ] -): +def promote_get_metrics() -> Tuple[ + Annotated[Dict[str, Any], "latest_metrics"], + Annotated[Dict[str, Any], "current_metrics`"], +]: """Get metrics for comparison for promoting a model. This is an example of a metric retrieval step. It is used to retrieve From 467f25b12b79539f9fdbbbee9c8c3dc1967a5085 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 9 Jan 2025 16:08:28 +0100 Subject: [PATCH 35/50] Validate and format cron expression for SageMaker --- .../orchestrators/sagemaker.md | 2 +- .../orchestrators/sagemaker_orchestrator.py | 37 ++++++++++++++++--- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/docs/book/component-guide/orchestrators/sagemaker.md b/docs/book/component-guide/orchestrators/sagemaker.md index f8ab791d36..caafeeb22c 100644 --- a/docs/book/component-guide/orchestrators/sagemaker.md +++ b/docs/book/component-guide/orchestrators/sagemaker.md @@ -372,7 +372,7 @@ When you deploy a scheduled pipeline, ZenML will: 3. Enable automatic execution based on the schedule {% hint style="info" %} -If you run the same pipeline with a schedule multiple times, the existing schedule will **not** be updated with the new settings. Rather, ZenML will create a new sagemaker pipeline and attach a new schedule to it. The user must manually delete the old pipeline and their attached schedule using the AWS CLI or API. 
See details here: [SageMaker Pipeline Schedules](https://docs.aws.amazon.com/sagemaker/latest/dg/pipeline-eventbridge.html) +If you run the same pipeline with a schedule multiple times, the existing schedule will **not** be updated with the new settings. Rather, ZenML will create a new sagemaker pipeline and attach a new schedule to it. The user must manually delete the old pipeline and their attached schedule using the AWS CLI or API (`aws scheduler delete-schedule `). See details here: [SageMaker Pipeline Schedules](https://docs.aws.amazon.com/sagemaker/latest/dg/pipeline-eventbridge.html) {% endhint %} #### Required IAM Permissions diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 49826c91ad..07ed63739a 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -488,10 +488,9 @@ def prepare_or_run_pipeline( # Create PipelineSchedule based on schedule type if deployment.schedule.cron_expression: - # Strip any "cron(" prefix if it exists - cron_exp = deployment.schedule.cron_expression.replace( - "cron(", "" - ).replace(")", "") + cron_exp = self._validate_cron_expression( + deployment.schedule.cron_expression + ) schedule = PipelineSchedule( name=schedule_name, cron=cron_exp, @@ -594,7 +593,9 @@ def prepare_or_run_pipeline( logger.info( "\n\nIn order to cancel the schedule, you can use execute the following command:\n" ) - logger.info(f"`aws events disable-rule --name {schedule_name}`") + logger.info( + f"`aws scheduler delete-schedule --name {schedule_name}`" + ) else: # Execute the pipeline immediately if no schedule is specified execution = pipeline.start() @@ -842,3 +843,29 @@ def _compute_orchestrator_run_id( f"There was an issue while extracting the pipeline run ID: {e}" ) return None + + def _validate_cron_expression(self, cron_expression: str) -> str: + 
"""Validates and formats a cron expression for SageMaker Pipeline Schedule. + + Args: + cron_expression: The cron expression to validate + + Returns: + The formatted cron expression + + Raises: + ValueError: If the cron expression is invalid + """ + # Strip any "cron(" prefix if it exists + cron_exp = cron_expression.replace("cron(", "").replace(")", "") + + # Split into components + parts = cron_exp.split() + if len(parts) not in [6, 7]: # AWS cron requires 6 or 7 fields + raise ValueError( + f"Invalid cron expression: {cron_expression}. AWS cron expressions must " + "have 6 or 7 fields: minute hour day-of-month month day-of-week year(optional). " + "Example: '15 10 ? * 6L 2022-2023'" + ) + + return cron_exp From b995970ebc501124d3c5e175b38617b6401303d8 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Thu, 9 Jan 2025 16:13:19 +0100 Subject: [PATCH 36/50] Update start time calculation for SageMaker Orchestrator --- .../integrations/aws/orchestrators/sagemaker_orchestrator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 07ed63739a..df6c72c3cd 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -516,7 +516,8 @@ def prepare_or_run_pipeline( enabled=True, ) next_execution = ( - deployment.schedule.start_time or datetime.utcnow() + deployment.schedule.start_time + or datetime.now(timezone.utc) ) + deployment.schedule.interval_second else: # One-time schedule From a0b77568b1c9a7e8f530ccccc9ccdec45faf7d78 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 13 Jan 2025 11:30:54 +0100 Subject: [PATCH 37/50] Add metadata tracking in Sagemaker orchestrator --- .../orchestrators/sagemaker_orchestrator.py | 38 +++++++------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git 
a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index df6c72c3cd..aa3fe6046b 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -643,6 +643,8 @@ def get_pipeline_run_metadata( Returns: A dictionary of metadata. """ + # TODO: Here we need to find some relevant metadata to track + # in case of a scheduled pipeline. pipeline_execution_arn = os.environ[ENV_ZENML_SAGEMAKER_RUN_ID] run_metadata: Dict[str, "MetadataType"] = { "pipeline_execution_arn": pipeline_execution_arn, @@ -714,7 +716,7 @@ def compute_metadata( """Generate run metadata based on the generated Sagemaker Execution. Args: - execution: The corresponding _PipelineExecution object or schedule metadata dict. + execution: The corresponding _PipelineExecution object. settings: The Sagemaker orchestrator settings. Yields: @@ -723,31 +725,19 @@ def compute_metadata( # Metadata metadata: Dict[str, MetadataType] = {} - # Handle schedule metadata if execution is a dict - if isinstance(execution, dict): - metadata.update( - { - "schedule_rule_name": execution["rule_name"], - "schedule_type": execution["schedule_type"], - "schedule_expression": execution["schedule_expr"], - "pipeline_name": execution["pipeline_name"], - } - ) - - if next_execution := execution.get("next_execution"): - metadata["next_execution_time"] = next_execution.isoformat() - else: - # Handle execution metadata - if run_id := self._compute_orchestrator_run_id(execution): - metadata[METADATA_ORCHESTRATOR_RUN_ID] = run_id + # Orchestrator Run ID + if run_id := self._compute_orchestrator_run_id(execution): + metadata[METADATA_ORCHESTRATOR_RUN_ID] = run_id - if orchestrator_url := self._compute_orchestrator_url(execution): - metadata[METADATA_ORCHESTRATOR_URL] = Uri(orchestrator_url) + # URL to the Sagemaker's pipeline view + if orchestrator_url := 
self._compute_orchestrator_url(execution): + metadata[METADATA_ORCHESTRATOR_URL] = Uri(orchestrator_url) - if logs_url := self._compute_orchestrator_logs_url( - execution, settings - ): - metadata[METADATA_ORCHESTRATOR_LOGS_URL] = Uri(logs_url) + # URL to the corresponding CloudWatch page + if logs_url := self._compute_orchestrator_logs_url( + execution, settings + ): + metadata[METADATA_ORCHESTRATOR_LOGS_URL] = Uri(logs_url) yield metadata From 56b0796f8523cb672d64718aacf0de95698d5dfd Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 16 Jan 2025 13:44:04 +0100 Subject: [PATCH 38/50] checkpoint --- src/zenml/integrations/aws/__init__.py | 3 +- .../flavors/sagemaker_orchestrator_flavor.py | 18 +++---- .../orchestrators/sagemaker_orchestrator.py | 53 ++++++++++++------- 3 files changed, 44 insertions(+), 30 deletions(-) diff --git a/src/zenml/integrations/aws/__init__.py b/src/zenml/integrations/aws/__init__.py index c18c90f4de..f7c5abcc2d 100644 --- a/src/zenml/integrations/aws/__init__.py +++ b/src/zenml/integrations/aws/__init__.py @@ -35,12 +35,13 @@ S3_RESOURCE_TYPE = "s3-bucket" AWS_IMAGE_BUILDER_FLAVOR = "aws" + class AWSIntegration(Integration): """Definition of AWS integration for ZenML.""" NAME = AWS REQUIREMENTS = [ - "sagemaker>=2.117.0", + "sagemaker>=2.199.0", "kubernetes", "aws-profile-manager", ] diff --git a/src/zenml/integrations/aws/flavors/sagemaker_orchestrator_flavor.py b/src/zenml/integrations/aws/flavors/sagemaker_orchestrator_flavor.py index 2e94f53398..26a445af8e 100644 --- a/src/zenml/integrations/aws/flavors/sagemaker_orchestrator_flavor.py +++ b/src/zenml/integrations/aws/flavors/sagemaker_orchestrator_flavor.py @@ -132,15 +132,6 @@ class SagemakerOrchestratorSettings(BaseSettings): ("processor_role", "execution_role"), ("processor_tags", "tags") ) - @property - def is_schedulable(self) -> bool: - """Whether the orchestrator is schedulable or not. - - Returns: - Whether the orchestrator is schedulable or not. 
- """ - return True - @model_validator(mode="before") def validate_model(cls, data: Dict[str, Any]) -> Dict[str, Any]: """Check if model is configured correctly. @@ -243,6 +234,15 @@ def is_synchronous(self) -> bool: """ return self.synchronous + @property + def is_schedulable(self) -> bool: + """Whether the orchestrator is schedulable or not. + + Returns: + Whether the orchestrator is schedulable or not. + """ + return True + class SagemakerOrchestratorFlavor(BaseOrchestratorFlavor): """Flavor for the Sagemaker orchestrator.""" diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index aa3fe6046b..69bd0052d1 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -479,8 +479,9 @@ def prepare_or_run_pipeline( if deployment.schedule: if settings.synchronous: logger.warning( - "The 'synchronous' setting is ignored for scheduled pipelines since " - "they run independently of the deployment process." + "The 'synchronous' setting is ignored for scheduled " + "pipelines since they run independently of the " + "deployment process." ) schedule_name = f"zenml-{deployment.pipeline_configuration.name}" @@ -498,8 +499,8 @@ def prepare_or_run_pipeline( enabled=True, ) elif deployment.schedule.interval_second: - # This is necessary because SageMaker's PipelineSchedule rate expressions - # require minutes as the minimum time unit. + # This is necessary because SageMaker's PipelineSchedule rate + # expressions require minutes as the minimum time unit. # Even if a user specifies an interval of less than 60 seconds, # it will be rounded up to 1 minute. 
minutes = max( @@ -527,7 +528,8 @@ def prepare_or_run_pipeline( ) if not execution_time: raise ValueError( - "A start time must be specified for one-time schedule execution" + "A start time must be specified for one-time " + "schedule execution" ) schedule = PipelineSchedule( name=schedule_name, @@ -549,8 +551,10 @@ def prepare_or_run_pipeline( # If this is a user ARN, try to get the role ARN if ":user/" in service_connector_role_arn: logger.warning( - f"Using IAM user credentials ({service_connector_role_arn}). For production " - "environments, it's recommended to use IAM roles instead." + f"Using IAM user credentials " + f"({service_connector_role_arn}). For production " + "environments, it's recommended to use IAM roles " + "instead." ) # If this is an assumed role, extract the role ARN elif ":assumed-role/" in service_connector_role_arn: @@ -574,16 +578,19 @@ def prepare_or_run_pipeline( # Attach schedule to pipeline triggers = pipeline.put_triggers( - triggers=[schedule], role_arn=service_connector_role_arn + triggers=[schedule], + role_arn=service_connector_role_arn, ) logger.info(f"The schedule ARN is: {triggers[0]}") logger.info( f"Successfully scheduled pipeline with name: {schedule_name}\n" + ( - f"First execution will occur at: {next_execution.strftime('%Y-%m-%d %H:%M:%S UTC')}" + f"First execution will occur at: " + f"{next_execution.strftime('%Y-%m-%d %H:%M:%S UTC')}" if next_execution - else f"Using cron expression: {deployment.schedule.cron_expression}" + else f"Using cron expression: " + f"{deployment.schedule.cron_expression}" ) + ( f" (and every {minutes} minutes after)" @@ -592,7 +599,8 @@ def prepare_or_run_pipeline( ) ) logger.info( - "\n\nIn order to cancel the schedule, you can use execute the following command:\n" + "\n\nIn order to cancel the schedule, you can use execute " + "the following command:\n" ) logger.info( f"`aws scheduler delete-schedule --name {schedule_name}`" @@ -613,7 +621,8 @@ def prepare_or_run_pipeline( # mainly for 
testing purposes, we wait for the pipeline to finish if settings.synchronous: logger.info( - "Executing synchronously. Waiting for pipeline to finish... \n" + "Executing synchronously. Waiting for pipeline to " + "finish... \n" "At this point you can `Ctrl-C` out without cancelling the " "execution." ) @@ -626,8 +635,8 @@ def prepare_or_run_pipeline( raise RuntimeError( "Timed out while waiting for pipeline execution to " "finish. For long-running pipelines we recommend " - "configuring your orchestrator for asynchronous execution. " - "The following command does this for you: \n" + "configuring your orchestrator for asynchronous " + "execution. The following command does this for you: \n" f"`zenml orchestrator update {self.name} " f"--synchronous=False`" ) @@ -644,7 +653,9 @@ def get_pipeline_run_metadata( A dictionary of metadata. """ # TODO: Here we need to find some relevant metadata to track - # in case of a scheduled pipeline. + # in case of a scheduled pipeline. + # Use ARN to fetch the execution + # Metadata about the schedule pipeline_execution_arn = os.environ[ENV_ZENML_SAGEMAKER_RUN_ID] run_metadata: Dict[str, "MetadataType"] = { "pipeline_execution_arn": pipeline_execution_arn, @@ -835,8 +846,9 @@ def _compute_orchestrator_run_id( ) return None - def _validate_cron_expression(self, cron_expression: str) -> str: - """Validates and formats a cron expression for SageMaker Pipeline Schedule. + @staticmethod + def _validate_cron_expression(cron_expression: str) -> str: + """Validates and formats a cron expression for SageMaker schedules. Args: cron_expression: The cron expression to validate @@ -854,9 +866,10 @@ def _validate_cron_expression(self, cron_expression: str) -> str: parts = cron_exp.split() if len(parts) not in [6, 7]: # AWS cron requires 6 or 7 fields raise ValueError( - f"Invalid cron expression: {cron_expression}. AWS cron expressions must " - "have 6 or 7 fields: minute hour day-of-month month day-of-week year(optional). " - "Example: '15 10 ? 
* 6L 2022-2023'" + f"Invalid cron expression: {cron_expression}. AWS cron " + "expressions must have 6 or 7 fields: minute hour day-of-month " + "month day-of-week year(optional). Example: '15 10 ? * 6L " + "2022-2023'" ) return cron_exp From 1037230b822cba09d03d30a0186b58a2cb97648f Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 16 Jan 2025 14:46:17 +0100 Subject: [PATCH 39/50] docs updates --- .../component-guide/orchestrators/azureml.md | 14 +- .../orchestrators/sagemaker.md | 165 ++++++++++++------ .../component-guide/orchestrators/vertex.md | 67 ++++--- 3 files changed, 162 insertions(+), 84 deletions(-) diff --git a/docs/book/component-guide/orchestrators/azureml.md b/docs/book/component-guide/orchestrators/azureml.md index e47b4d8e9f..0cce7d75b0 100644 --- a/docs/book/component-guide/orchestrators/azureml.md +++ b/docs/book/component-guide/orchestrators/azureml.md @@ -195,10 +195,10 @@ def example_step() -> int: @pipeline(settings={"orchestrator": azureml_settings}) -def pipeline(): +def my_pipeline(): example_step() -pipeline() +my_pipeline() ``` {% hint style="info" %} @@ -213,10 +213,18 @@ its [JobSchedules](https://learn.microsoft.com/en-us/azure/machine-learning/how- Both cron expression and intervals are supported. ```python +from zenml import pipeline from zenml.config.schedule import Schedule +@pipeline +def my_pipeline(): + ... 
+ # Run a pipeline every 5th minute -pipeline.run(schedule=Schedule(cron_expression="*/5 * * * *")) +my_pipeline = my_pipeline.with_options( + schedule=Schedule(cron_expression="*/5 * * * *") +) +my_pipeline() ``` Once you run the pipeline with a schedule, you can find the schedule and diff --git a/docs/book/component-guide/orchestrators/sagemaker.md b/docs/book/component-guide/orchestrators/sagemaker.md index caafeeb22c..a63230700b 100644 --- a/docs/book/component-guide/orchestrators/sagemaker.md +++ b/docs/book/component-guide/orchestrators/sagemaker.md @@ -166,19 +166,23 @@ Additional configuration for the Sagemaker orchestrator can be passed via `Sagem * `base_job_name` * `env` -For example, settings can be provided in the following way: +For example, settings can be provided and applied in the following way: ```python +from zenml import step +from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import ( + SagemakerOrchestratorSettings +) + sagemaker_orchestrator_settings = SagemakerOrchestratorSettings( instance_type="ml.m5.large", volume_size_in_gb=30, ) -``` -They can then be applied to a step as follows: -```python @step(settings={"orchestrator": sagemaker_orchestrator_settings}) +def my_step() -> None: + pass ``` For example, if your ZenML component is configured to use `ml.c5.xlarge` with 400GB additional storage by default, all steps will use it except for the step above, which will use `ml.t3.medium` (for Processing Steps) or `ml.m5.xlarge` (for Training Steps) with 30GB additional storage. See the next section for details on how ZenML decides which Sagemaker Step type to use. 
@@ -194,6 +198,8 @@ For more information and a full list of configurable attributes of the Sagemaker To enable Warm Pools, use the [`SagemakerOrchestratorSettings`](https://sdkdocs.zenml.io/latest/integration_code_docs/integrations-aws/#zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor.SagemakerOrchestratorSettings) class: ```python +from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import SagemakerOrchestratorSettings + sagemaker_orchestrator_settings = SagemakerOrchestratorSettings( keep_alive_period_in_seconds = 300, # 5 minutes, default value ) @@ -204,6 +210,8 @@ This configuration keeps instances warm for 5 minutes after each job completes, If you prefer not to use Warm Pools, you can explicitly disable them: ```python +from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import SagemakerOrchestratorSettings + sagemaker_orchestrator_settings = SagemakerOrchestratorSettings( keep_alive_period_in_seconds = None, ) @@ -212,6 +220,8 @@ sagemaker_orchestrator_settings = SagemakerOrchestratorSettings( By default, the SageMaker orchestrator uses Training Steps where possible, which can offer performance benefits and better integration with SageMaker's training capabilities. 
To disable this behavior: ```python +from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import SagemakerOrchestratorSettings + sagemaker_orchestrator_settings = SagemakerOrchestratorSettings( use_training_step = False ) @@ -232,6 +242,10 @@ Note that data import and export can be used jointly with `processor_args` for m A simple example of importing data from S3 to the Sagemaker job is as follows: ```python +from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import ( + SagemakerOrchestratorSettings +) + sagemaker_orchestrator_settings = SagemakerOrchestratorSettings( input_data_s3_mode="File", input_data_s3_uri="s3://some-bucket-name/folder" @@ -243,6 +257,10 @@ In this case, data will be available at `/opt/ml/processing/input/data` within t It is also possible to split your input over channels. This can be useful if the dataset is already split in S3, or maybe even located in different buckets. ```python +from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import ( + SagemakerOrchestratorSettings +) + sagemaker_orchestrator_settings = SagemakerOrchestratorSettings( input_data_s3_mode="File", input_data_s3_uri={ @@ -264,6 +282,10 @@ Data from within the job (e.g. 
produced by the training process, or when preproc In the simple case, data in `/opt/ml/processing/output/data` will be copied to S3 at the end of a job: ```python +from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import ( + SagemakerOrchestratorSettings +) + sagemaker_orchestrator_settings = SagemakerOrchestratorSettings( output_data_s3_mode="EndOfJob", output_data_s3_uri="s3://some-results-bucket-name/results" @@ -273,6 +295,10 @@ sagemaker_orchestrator_settings = SagemakerOrchestratorSettings( In a more complex case, data in `/opt/ml/processing/output/data/metadata` and `/opt/ml/processing/output/data/checkpoints` will be written away continuously: ```python +from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import ( + SagemakerOrchestratorSettings +) + sagemaker_orchestrator_settings = SagemakerOrchestratorSettings( output_data_s3_mode="Continuous", output_data_s3_uri={ @@ -292,7 +318,9 @@ The SageMaker orchestrator allows you to add tags to your pipeline executions an ```python from zenml import pipeline, step -from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import SagemakerOrchestratorSettings +from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import ( + SagemakerOrchestratorSettings +) # Define settings for the pipeline pipeline_settings = SagemakerOrchestratorSettings( @@ -344,35 +372,58 @@ The SageMaker orchestrator supports running pipelines on a schedule using SageMa * Running once at a specific time ```python -from zenml import pipeline from datetime import datetime, timedelta +from zenml import pipeline +from zenml.config.schedule import Schedule + # Using a cron expression (runs daily at 2 AM UTC) -@pipeline(schedule=Schedule(cron_expression="0 2 * * *")) +@pipeline def my_scheduled_pipeline(): # Your pipeline steps here pass +my_scheduled_pipeline.with_options( + schedule=Schedule(cron_expression="0 2 * * *") +)() + # Using an interval (runs every 2 hours) 
-@pipeline(schedule=Schedule(interval_second=timedelta(hours=2))) +@pipeline def my_interval_pipeline(): # Your pipeline steps here pass +my_interval_pipeline.with_options( + schedule=Schedule( + start_time=datetime.now(), + interval_second=timedelta(hours=2) + ) +)() + # Running once at a specific time -@pipeline(schedule=Schedule(run_once_start_time=datetime(2024, 12, 31, 23, 59))) +@pipeline def my_one_time_pipeline(): # Your pipeline steps here pass + +my_one_time_pipeline.with_options( + schedule=Schedule(run_once_start_time=datetime(2024, 12, 31, 23, 59)) +)() ``` When you deploy a scheduled pipeline, ZenML will: + 1. Create a SageMaker Pipeline Schedule with the specified configuration 2. Configure the pipeline as the target for the schedule 3. Enable automatic execution based on the schedule {% hint style="info" %} -If you run the same pipeline with a schedule multiple times, the existing schedule will **not** be updated with the new settings. Rather, ZenML will create a new sagemaker pipeline and attach a new schedule to it. The user must manually delete the old pipeline and their attached schedule using the AWS CLI or API (`aws scheduler delete-schedule `). See details here: [SageMaker Pipeline Schedules](https://docs.aws.amazon.com/sagemaker/latest/dg/pipeline-eventbridge.html) +If you run the same pipeline with a schedule multiple times, the existing +schedule will **not** be updated with the new settings. Rather, ZenML will +create a new sagemaker pipeline and attach a new schedule to it. The user +must manually delete the old pipeline and their attached schedule using the +AWS CLI or API (`aws scheduler delete-schedule `). 
See details +here: [SageMaker Pipeline Schedules](https://docs.aws.amazon.com/sagemaker/latest/dg/pipeline-eventbridge.html) {% endhint %} #### Required IAM Permissions @@ -380,60 +431,60 @@ If you run the same pipeline with a schedule multiple times, the existing schedu When using scheduled pipelines, you need to ensure your IAM role (either the service connector role or the configured `scheduler_role`) has the correct permissions and trust relationships: 1. **Trust Relationships** -Your service connector role needs to trust both SageMaker and EventBridge Scheduler services: - -```json -{ - "Version": "2012-10-17", - "Statement": [ + Your service connector role needs to trust both SageMaker and EventBridge Scheduler services: + + ```json { - "Effect": "Allow", - "Principal": { - "AWS": "", ## This is the ARN of the user that is configured in the service connector - # This is the list of services that the service connector role needs to schedule pipelines - "Service": [ - "sagemaker.amazonaws.com", - "scheduler.amazonaws.com" - ] - }, - "Action": "sts:AssumeRole" + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "", ## This is the ARN of the user that is configured in the service connector + # This is the list of services that the service connector role needs to schedule pipelines + "Service": [ + "sagemaker.amazonaws.com", + "scheduler.amazonaws.com" + ] + }, + "Action": "sts:AssumeRole" + } + ] } - ] -} -``` + ``` 2. 
**Required IAM Policies** -The scheduler role (see below) needs the following permissions to manage scheduled pipelines: - -```json -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "scheduler:ListSchedules", - "scheduler:GetSchedule", - "scheduler:CreateSchedule", - "scheduler:UpdateSchedule", - "scheduler:DeleteSchedule" - ], - "Resource": "*" - }, + The scheduler role (see below) needs the following permissions to manage scheduled pipelines: + + ```json { - "Effect": "Allow", - "Action": "iam:PassRole", - "Resource": "arn:aws:iam::*:role/*", - "Condition": { - "StringLike": { - "iam:PassedToService": "scheduler.amazonaws.com" + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "scheduler:ListSchedules", + "scheduler:GetSchedule", + "scheduler:CreateSchedule", + "scheduler:UpdateSchedule", + "scheduler:DeleteSchedule" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "arn:aws:iam::*:role/*", + "Condition": { + "StringLike": { + "iam:PassedToService": "scheduler.amazonaws.com" + } + } } - } + ] } - ] -} -``` + ``` Or you can use the `AmazonEventBridgeSchedulerFullAccess` managed policy. @@ -447,7 +498,7 @@ Without these permissions, the scheduling functionality will fail. Make sure to By default, the SageMaker orchestrator will use the attached [service connector role](../../how-to/infrastructure-deployment/auth-management/aws-service-connector.md) to schedule pipelines. 
However, you can specify a different role to be used for scheduling by configuring the `scheduler_role` parameter: -```python +```bash # When registering the orchestrator zenml orchestrator register sagemaker-orchestrator \ --flavor=sagemaker \ diff --git a/docs/book/component-guide/orchestrators/vertex.md b/docs/book/component-guide/orchestrators/vertex.md index 210d34f931..745d423eb0 100644 --- a/docs/book/component-guide/orchestrators/vertex.md +++ b/docs/book/component-guide/orchestrators/vertex.md @@ -184,7 +184,7 @@ For any runs executed on Vertex, you can get the URL to the Vertex UI in Python from zenml.client import Client pipeline_run = Client().get_pipeline_run("") -orchestrator_url = pipeline_run.run_metadata["orchestrator_url"].value +orchestrator_url = pipeline_run.run_metadata["orchestrator_url"] ``` ### Run pipelines on a schedule @@ -194,24 +194,37 @@ The Vertex Pipelines orchestrator supports running pipelines on a schedule using **How to schedule a pipeline** ```python +from datetime import datetime, timedelta + +from zenml import pipeline from zenml.config.schedule import Schedule +@pipeline +def first_pipeline(): + ... + # Run a pipeline every 5th minute -pipeline_instance.run( +first_pipeline = first_pipeline.with_options( schedule=Schedule( cron_expression="*/5 * * * *" ) ) +first_pipeline() + +@pipeline +def second_pipeline(): + ... 
# Run a pipeline every hour # starting in one day from now and ending in three days from now -pipeline_instance.run( +second_pipeline = second_pipeline.with_options( schedule=Schedule( - cron_expression="0 * * * *" - start_time=datetime.datetime.now() + datetime.timedelta(days=1), - end_time=datetime.datetime.now() + datetime.timedelta(days=3), + cron_expression="0 * * * *", + start_time=datetime.now() + timedelta(days=1), + end_time=datetime.now() + timedelta(days=3), ) ) +second_pipeline() ``` {% hint style="warning" %} @@ -233,23 +246,32 @@ In order to cancel a scheduled Vertex pipeline, you need to manually delete the For additional configuration of the Vertex orchestrator, you can pass `VertexOrchestratorSettings` which allows you to configure labels for your Vertex Pipeline jobs or specify which GPU to use. ```python -from zenml.integrations.gcp.flavors.vertex_orchestrator_flavor import VertexOrchestratorSettings -from kubernetes.client.models import V1Toleration - -vertex_settings = VertexOrchestratorSettings( - labels={"key": "value"} +from zenml.integrations.gcp.flavors.vertex_orchestrator_flavor import ( + VertexOrchestratorSettings ) + +vertex_settings = VertexOrchestratorSettings(labels={"key": "value"}) ``` If your pipelines steps have certain hardware requirements, you can specify them as `ResourceSettings`: ```python +from zenml.config import ResourceSettings + resource_settings = ResourceSettings(cpu_count=8, memory="16GB") ``` -To run your pipeline (or some steps of it) on a GPU, you will need to set both a node selector -and the gpu count as follows: +To run your pipeline (or some steps of it) on a GPU, you will need to set both +a node selector and the gpu count as follows: + ```python +from zenml import step, pipeline + +from zenml.config import ResourceSettings +from zenml.integrations.gcp.flavors.vertex_orchestrator_flavor import ( + VertexOrchestratorSettings +) + vertex_settings = VertexOrchestratorSettings( pod_settings={ "node_selectors": 
{ @@ -258,33 +280,30 @@ vertex_settings = VertexOrchestratorSettings( } ) resource_settings = ResourceSettings(gpu_count=1) -``` -You can find available accelerator types [here](https://cloud.google.com/vertex-ai/docs/training/configure-compute#specifying_gpus). - -These settings can then be specified on either pipeline-level or step-level: -```python -# Either specify on pipeline-level -@pipeline( +# Either specify settings on step-level +@step( settings={ "orchestrator": vertex_settings, "resources": resource_settings, } ) -def my_pipeline(): +def my_step(): ... -# OR specify settings on step-level -@step( +# OR specify on pipeline-level +@pipeline( settings={ "orchestrator": vertex_settings, "resources": resource_settings, } ) -def my_step(): +def my_pipeline(): ... ``` +You can find available accelerator types [here](https://cloud.google.com/vertex-ai/docs/training/configure-compute#specifying_gpus). + Check out the [SDK docs](https://sdkdocs.zenml.io/latest/integration\_code\_docs/integrations-gcp/#zenml.integrations.gcp.flavors.vertex\_orchestrator\_flavor.VertexOrchestratorSettings) for a full list of available attributes and [this docs page](../../how-to/pipeline-development/use-configuration-files/runtime-configuration.md) for more information on how to specify settings. For more information and a full list of configurable attributes of the Vertex orchestrator, check out the [SDK Docs](https://sdkdocs.zenml.io/latest/integration\_code\_docs/integrations-gcp/#zenml.integrations.gcp.orchestrators.vertex\_orchestrator.VertexOrchestrator) . 
From 98896d4f382b08c8d8fee880033bfc806ab0596a Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 16 Jan 2025 14:47:07 +0100 Subject: [PATCH 40/50] second checkpoint --- .../orchestrators/sagemaker_orchestrator.py | 85 ++++++++++++------- 1 file changed, 52 insertions(+), 33 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 69bd0052d1..a211b93907 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -239,7 +239,8 @@ def prepare_or_run_pipeline( environment. Raises: - RuntimeError: If there is an error creating or scheduling the pipeline. + RuntimeError: If there is an error creating or scheduling the + pipeline. TypeError: If the network_config passed is not compatible with the AWS SageMaker NetworkConfig class. ValueError: If the schedule is not valid. @@ -484,7 +485,7 @@ def prepare_or_run_pipeline( "deployment process." ) - schedule_name = f"zenml-{deployment.pipeline_configuration.name}" + schedule_name = orchestrator_run_name next_execution = None # Create PipelineSchedule based on schedule type @@ -541,7 +542,8 @@ def prepare_or_run_pipeline( # Get the current role ARN if not explicitly configured if self.config.scheduler_role is None: logger.info( - "No scheduler_role configured. Using service connector role to schedule pipeline." + "No scheduler_role configured. Using service connector " + "role to schedule pipeline." 
) sts = session.boto_session.client("sts") try: @@ -560,7 +562,7 @@ def prepare_or_run_pipeline( elif ":assumed-role/" in service_connector_role_arn: # Convert assumed-role ARN format to role ARN format # From: arn:aws:sts::123456789012:assumed-role/role-name/session-name - # To: arn:aws:iam::123456789012:role/role-name + # To: arn:aws:iam::123456789012:role/role-name service_connector_role_arn = re.sub( r"arn:aws:sts::(\d+):assumed-role/([^/]+)/.*", r"arn:aws:iam::\1:role/\2", @@ -568,10 +570,12 @@ def prepare_or_run_pipeline( ) except Exception: raise RuntimeError( - "Failed to get current role ARN from service connector. This means " - "the service connector is not configured correctly to schedule sagemaker " - "pipelines. You can either fix the service connector or configure " - "`scheduler_role` explicitly in your orchestrator config." + "Failed to get current role ARN from service " + "connector. This means the service connector is not " + "configured correctly to schedule sagemaker " + "pipelines. You can either fix the service connector " + "or configure `scheduler_role` explicitly in your " + "orchestrator config." ) else: service_connector_role_arn = self.config.scheduler_role @@ -590,7 +594,7 @@ def prepare_or_run_pipeline( f"{next_execution.strftime('%Y-%m-%d %H:%M:%S UTC')}" if next_execution else f"Using cron expression: " - f"{deployment.schedule.cron_expression}" + f"{deployment.schedule.cron_expression}" ) + ( f" (and every {minutes} minutes after)" @@ -615,7 +619,7 @@ def prepare_or_run_pipeline( # Yield metadata based on the generated execution object yield from self.compute_metadata( - execution=execution, settings=settings + execution_arn=execution.arn, settings=settings ) # mainly for testing purposes, we wait for the pipeline to finish @@ -652,15 +656,28 @@ def get_pipeline_run_metadata( Returns: A dictionary of metadata. """ - # TODO: Here we need to find some relevant metadata to track - # in case of a scheduled pipeline. 
- # Use ARN to fetch the execution - # Metadata about the schedule - pipeline_execution_arn = os.environ[ENV_ZENML_SAGEMAKER_RUN_ID] + from zenml.client import Client + + execution_arn = os.environ[ENV_ZENML_SAGEMAKER_RUN_ID] run_metadata: Dict[str, "MetadataType"] = { - "pipeline_execution_arn": pipeline_execution_arn, + "pipeline_execution_arn": execution_arn, } + client = Client() + + deployment_id = client.get_pipeline_run(run_id).deployment_id + deployment = client.get_deployment(deployment_id) + + settings = cast( + SagemakerOrchestratorSettings, self.get_settings(deployment) + ) + + for metadata in self.compute_metadata( + execution_arn=execution_arn, + settings=settings, + ): + run_metadata.update(metadata) + return run_metadata def fetch_status(self, run: "PipelineRunResponse") -> ExecutionStatus: @@ -721,13 +738,13 @@ def fetch_status(self, run: "PipelineRunResponse") -> ExecutionStatus: def compute_metadata( self, - execution: Any, + execution_arn: Any, settings: SagemakerOrchestratorSettings, ) -> Iterator[Dict[str, MetadataType]]: """Generate run metadata based on the generated Sagemaker Execution. Args: - execution: The corresponding _PipelineExecution object. + execution_arn: The ARN of the pipeline execution. settings: The Sagemaker orchestrator settings. 
Yields: @@ -737,40 +754,42 @@ def compute_metadata( metadata: Dict[str, MetadataType] = {} # Orchestrator Run ID - if run_id := self._compute_orchestrator_run_id(execution): - metadata[METADATA_ORCHESTRATOR_RUN_ID] = run_id + if execution_arn: + metadata[METADATA_ORCHESTRATOR_RUN_ID] = execution_arn # URL to the Sagemaker's pipeline view - if orchestrator_url := self._compute_orchestrator_url(execution): + if orchestrator_url := self._compute_orchestrator_url( + execution_arn=execution_arn + ): metadata[METADATA_ORCHESTRATOR_URL] = Uri(orchestrator_url) # URL to the corresponding CloudWatch page if logs_url := self._compute_orchestrator_logs_url( - execution, settings + execution_arn=execution_arn, settings=settings ): metadata[METADATA_ORCHESTRATOR_LOGS_URL] = Uri(logs_url) yield metadata - @staticmethod def _compute_orchestrator_url( - pipeline_execution: Any, + self, + execution_arn: Any, ) -> Optional[str]: """Generate the Orchestrator Dashboard URL upon pipeline execution. Args: - pipeline_execution: The corresponding _PipelineExecution object. + execution_arn: The ARN of the pipeline execution. Returns: the URL to the dashboard view in SageMaker. """ try: region_name, pipeline_name, execution_id = ( - dissect_pipeline_execution_arn(pipeline_execution.arn) + dissect_pipeline_execution_arn(execution_arn) ) # Get the Sagemaker session - session = pipeline_execution.sagemaker_session + session = self._get_sagemaker_session() # List the Studio domains and get the Studio Domain ID domains_response = session.sagemaker_client.list_domains() @@ -790,13 +809,13 @@ def _compute_orchestrator_url( @staticmethod def _compute_orchestrator_logs_url( - pipeline_execution: Any, + execution_arn: Any, settings: SagemakerOrchestratorSettings, ) -> Optional[str]: """Generate the CloudWatch URL upon pipeline execution. Args: - pipeline_execution: The corresponding _PipelineExecution object. + execution_arn: The ARN of the pipeline execution. 
settings: The Sagemaker orchestrator settings. Returns: @@ -804,7 +823,7 @@ def _compute_orchestrator_logs_url( """ try: region_name, _, execution_id = dissect_pipeline_execution_arn( - pipeline_execution.arn + execution_arn ) use_training_jobs = True @@ -827,18 +846,18 @@ def _compute_orchestrator_logs_url( @staticmethod def _compute_orchestrator_run_id( - pipeline_execution: Any, + execution_arn: Any, ) -> Optional[str]: """Fetch the Orchestrator Run ID upon pipeline execution. Args: - pipeline_execution: The corresponding _PipelineExecution object. + execution_arn: The ARN of the pipeline execution. Returns: the Execution ID of the run in SageMaker. """ try: - return str(pipeline_execution.arn) + return str(execution_arn) except Exception as e: logger.warning( From ab8990d26a36e3afef6b2a8ca9016ebada6231e4 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 16 Jan 2025 15:00:48 +0100 Subject: [PATCH 41/50] removing the old dependency limitations --- src/zenml/integrations/huggingface/__init__.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/zenml/integrations/huggingface/__init__.py b/src/zenml/integrations/huggingface/__init__.py index f1e1721bbb..1805ce0d92 100644 --- a/src/zenml/integrations/huggingface/__init__.py +++ b/src/zenml/integrations/huggingface/__init__.py @@ -52,11 +52,6 @@ def get_requirements(cls, target_os: Optional[str] = None) -> List[str]: "accelerate", "bitsandbytes>=0.41.3", "peft", - # temporary fix for CI issue similar to: - # - https://github.com/huggingface/datasets/issues/6737 - # - https://github.com/huggingface/datasets/issues/6697 - # TODO try relaxing it back going forward - "fsspec<=2023.12.0", "transformers", ] From 737fb6eef75d803b51b7df81d09414551d0b4565 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Sun, 19 Jan 2025 20:31:53 +0100 Subject: [PATCH 42/50] schedule metadata --- .../orchestrators/sagemaker.md | 2 +- src/zenml/enums.py | 1 + .../orchestrators/sagemaker_orchestrator.py | 88 
++++++++++++++----- src/zenml/models/v2/core/schedule.py | 17 +++- .../schemas/pipeline_run_schemas.py | 6 ++ .../zen_stores/schemas/schedule_schema.py | 19 +++- 6 files changed, 109 insertions(+), 24 deletions(-) diff --git a/docs/book/component-guide/orchestrators/sagemaker.md b/docs/book/component-guide/orchestrators/sagemaker.md index a63230700b..d19c988972 100644 --- a/docs/book/component-guide/orchestrators/sagemaker.md +++ b/docs/book/component-guide/orchestrators/sagemaker.md @@ -22,7 +22,7 @@ You should use the Sagemaker orchestrator if: ## How it works -The ZenML Sagemaker orchestrator works with [Sagemaker Pipelines](https://aws.amazon.com/sagemaker/pipelines), which can be used to construct machine learning pipelines. Under the hood, for each ZenML pipeline step, it creates a SageMaker `PipelineStep`, which contains a Sagemaker Processing job. Currently, other step types are not supported. +The ZenML Sagemaker orchestrator works with [Sagemaker Pipelines](https://aws.amazon.com/sagemaker/pipelines), which can be used to construct machine learning pipelines. Under the hood, for each ZenML pipeline step, it creates a SageMaker `PipelineStep`, which contains a Sagemaker Processing or Training job. 
## How to deploy it diff --git a/src/zenml/enums.py b/src/zenml/enums.py index 0469048f3d..e8d15001a2 100644 --- a/src/zenml/enums.py +++ b/src/zenml/enums.py @@ -376,6 +376,7 @@ class MetadataResourceTypes(StrEnum): STEP_RUN = "step_run" ARTIFACT_VERSION = "artifact_version" MODEL_VERSION = "model_version" + SCHEDULE = "schedule" class DatabaseBackupStrategy(StrEnum): diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index a211b93907..e2528f7df2 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -38,13 +38,18 @@ from sagemaker.workflow.steps import ProcessingStep, TrainingStep from sagemaker.workflow.triggers import PipelineSchedule +from zenml.client import Client from zenml.config.base_settings import BaseSettings from zenml.constants import ( METADATA_ORCHESTRATOR_LOGS_URL, METADATA_ORCHESTRATOR_RUN_ID, METADATA_ORCHESTRATOR_URL, ) -from zenml.enums import ExecutionStatus, StackComponentType +from zenml.enums import ( + ExecutionStatus, + MetadataResourceTypes, + StackComponentType, +) from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import ( SagemakerOrchestratorConfig, SagemakerOrchestratorSettings, @@ -71,6 +76,33 @@ logger = get_logger(__name__) +def dissect_schedule_arn( + schedule_arn: str, +) -> Tuple[Optional[str], Optional[str]]: + """Extracts the region and the name from an EventBridge schedule ARN. + + Args: + schedule_arn: The ARN of the EventBridge schedule. 
+ + Returns: + Region Name, Schedule Name (including the group name) + """ + # Split the ARN into parts + arn_parts = schedule_arn.split(":") + + # Validate ARN structure + if len(arn_parts) < 6 or not arn_parts[5].startswith("schedule/"): + raise ValueError("Invalid EventBridge schedule ARN format.") + + # Extract the region + region = arn_parts[3] + + # Extract the group name and schedule name + name = arn_parts[5].split("schedule/")[1] + + return region, name + + def dissect_pipeline_execution_arn( pipeline_execution_arn: str, ) -> Tuple[Optional[str], Optional[str], Optional[str]]: @@ -587,6 +619,28 @@ def prepare_or_run_pipeline( ) logger.info(f"The schedule ARN is: {triggers[0]}") + try: + from zenml.models import RunMetadataResource + + schedule_metadata = self.generate_schedule_metadata( + schedule_arn=triggers[0] + ) + + Client().create_run_metadata( + metadata=schedule_metadata, + resources=[ + RunMetadataResource( + id=deployment.schedule.id, + type=MetadataResourceTypes.SCHEDULE, + ) + ], + ) + except Exception as e: + logger.debug( + "There was a warning attaching the metadata to the ZenML" + f"schedule: {e}" + ) + logger.info( f"Successfully scheduled pipeline with name: {schedule_name}\n" + ( @@ -656,17 +710,16 @@ def get_pipeline_run_metadata( Returns: A dictionary of metadata. 
""" - from zenml.client import Client execution_arn = os.environ[ENV_ZENML_SAGEMAKER_RUN_ID] run_metadata: Dict[str, "MetadataType"] = { "pipeline_execution_arn": execution_arn, } - client = Client() + zenml_client = Client() - deployment_id = client.get_pipeline_run(run_id).deployment_id - deployment = client.get_deployment(deployment_id) + deployment_id = zenml_client.get_pipeline_run(run_id).deployment_id + deployment = zenml_client.get_deployment(deployment_id) settings = cast( SagemakerOrchestratorSettings, self.get_settings(deployment) @@ -845,25 +898,20 @@ def _compute_orchestrator_logs_url( return None @staticmethod - def _compute_orchestrator_run_id( - execution_arn: Any, - ) -> Optional[str]: - """Fetch the Orchestrator Run ID upon pipeline execution. + def generate_schedule_metadata(schedule_arn: str) -> Dict[str, str]: + """Attaches metadata to the ZenML Schedules. Args: - execution_arn: The ARN of the pipeline execution. - - Returns: - the Execution ID of the run in SageMaker. + schedule_arn: The trigger ARNs that is generated on the AWS side. 
""" - try: - return str(execution_arn) + region, name = dissect_schedule_arn(schedule_arn=schedule_arn) - except Exception as e: - logger.warning( - f"There was an issue while extracting the pipeline run ID: {e}" - ) - return None + return { + "trigger_url": ( + f"https://{region}.console.aws.amazon.com/scheduler/home" + f"?region={region}#schedules/{name}" + ), + } @staticmethod def _validate_cron_expression(cron_expression: str) -> str: diff --git a/src/zenml/models/v2/core/schedule.py b/src/zenml/models/v2/core/schedule.py index 0e7dc01c42..705b4f30c5 100644 --- a/src/zenml/models/v2/core/schedule.py +++ b/src/zenml/models/v2/core/schedule.py @@ -14,13 +14,14 @@ """Models representing schedules.""" import datetime -from typing import Optional, Union +from typing import Dict, Optional, Union from uuid import UUID from pydantic import Field, model_validator from zenml.constants import STR_FIELD_MAX_LENGTH from zenml.logger import get_logger +from zenml.metadata.metadata_types import MetadataType from zenml.models.v2.base.base import BaseUpdate from zenml.models.v2.base.scoped import ( WorkspaceScopedFilter, @@ -136,6 +137,11 @@ class ScheduleResponseMetadata(WorkspaceScopedResponseMetadata): orchestrator_id: Optional[UUID] pipeline_id: Optional[UUID] + run_metadata: Dict[str, MetadataType] = Field( + title="Metadata associated with this step run.", + default={}, + ) + class ScheduleResponseResources(WorkspaceScopedResponseResources): """Class for all resource models associated with the schedule entity.""" @@ -272,6 +278,15 @@ def pipeline_id(self) -> Optional[UUID]: """ return self.get_metadata().pipeline_id + @property + def run_metadata(self) -> Dict[str, MetadataType]: + """The `run_metadata` property. + + Returns: + the value of the property. 
+ """ + return self.get_metadata().run_metadata + # ------------------ Filter Model ------------------ diff --git a/src/zenml/zen_stores/schemas/pipeline_run_schemas.py b/src/zenml/zen_stores/schemas/pipeline_run_schemas.py index 1481a90d2d..ff6069666e 100644 --- a/src/zenml/zen_stores/schemas/pipeline_run_schemas.py +++ b/src/zenml/zen_stores/schemas/pipeline_run_schemas.py @@ -269,6 +269,12 @@ def fetch_metadata_collection(self) -> Dict[str, List[RunMetadataEntry]]: for k, v in step_metadata.items(): metadata_collection[f"{s.name}::{k}"] = v + # Fetch the metadata related to the schedule of this run + if schedule := self.deployment.schedule: + schedule_metadata = schedule.fetch_metadata_collection() + for k, v in schedule_metadata.items(): + metadata_collection[f"schedule:{k}"] = v + return metadata_collection def to_model( diff --git a/src/zenml/zen_stores/schemas/schedule_schema.py b/src/zenml/zen_stores/schemas/schedule_schema.py index 632744ce3e..911051d7de 100644 --- a/src/zenml/zen_stores/schemas/schedule_schema.py +++ b/src/zenml/zen_stores/schemas/schedule_schema.py @@ -14,11 +14,12 @@ """SQL Model Implementations for Pipeline Schedules.""" from datetime import datetime, timedelta -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, List, Optional from uuid import UUID from sqlmodel import Field, Relationship +from zenml.enums import MetadataResourceTypes from zenml.models import ( ScheduleRequest, ScheduleResponse, @@ -31,15 +32,19 @@ from zenml.zen_stores.schemas.pipeline_schemas import PipelineSchema from zenml.zen_stores.schemas.schema_utils import build_foreign_key_field from zenml.zen_stores.schemas.user_schemas import UserSchema +from zenml.zen_stores.schemas.utils import RunMetadataInterface from zenml.zen_stores.schemas.workspace_schemas import WorkspaceSchema if TYPE_CHECKING: from zenml.zen_stores.schemas.pipeline_deployment_schemas import ( PipelineDeploymentSchema, ) + from 
zenml.zen_stores.schemas.run_metadata_schemas import ( + RunMetadataSchema, + ) -class ScheduleSchema(NamedSchema, table=True): +class ScheduleSchema(NamedSchema, RunMetadataInterface, table=True): """SQL Model for schedules.""" __tablename__ = "schedule" @@ -89,6 +94,15 @@ class ScheduleSchema(NamedSchema, table=True): back_populates="schedules" ) + run_metadata: List["RunMetadataSchema"] = Relationship( + sa_relationship_kwargs=dict( + secondary="run_metadata_resource", + primaryjoin=f"and_(foreign(RunMetadataResourceSchema.resource_type)=='{MetadataResourceTypes.SCHEDULE.value}', foreign(RunMetadataResourceSchema.resource_id)==ScheduleSchema.id)", + secondaryjoin="RunMetadataSchema.id==foreign(RunMetadataResourceSchema.run_metadata_id)", + overlaps="run_metadata", + ), + ) + active: bool cron_expression: Optional[str] = Field(nullable=True) start_time: Optional[datetime] = Field(nullable=True) @@ -196,6 +210,7 @@ def to_model( workspace=self.workspace.to_model(), pipeline_id=self.pipeline_id, orchestrator_id=self.orchestrator_id, + run_metadata=self.fetch_metadata(), ) return ScheduleResponse( From 7f33927e34ebf06596769c1d045d4b16b67c12d2 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Sun, 19 Jan 2025 20:32:03 +0100 Subject: [PATCH 43/50] update on the docs --- .../orchestrators/sagemaker.md | 103 +++++++++++------- 1 file changed, 62 insertions(+), 41 deletions(-) diff --git a/docs/book/component-guide/orchestrators/sagemaker.md b/docs/book/component-guide/orchestrators/sagemaker.md index d19c988972..179144514e 100644 --- a/docs/book/component-guide/orchestrators/sagemaker.md +++ b/docs/book/component-guide/orchestrators/sagemaker.md @@ -54,6 +54,7 @@ zenml integration install aws s3 * A [remote container registry](../container-registries/container-registries.md) as part of your stack. 
* An IAM role or user with [an `AmazonSageMakerFullAccess` managed policy](https://docs.aws.amazon.com/sagemaker/latest/dg/security-iam-awsmanpol.html) applied to it as well as `sagemaker.amazonaws.com` added as a Principal Service. Full details on these permissions can be found [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) or use the ZenML recipe (when available) which will set up the necessary permissions for you. * The local client (whoever is running the pipeline) will also have to have the necessary permissions or roles to be able to launch Sagemaker jobs. (This would be covered by the `AmazonSageMakerFullAccess` policy suggested above.) +* If you want to use schedules, you also need to set up the correct roles, permissions and policies covered [here](#required-iam-permissions-for-schedules). There are three ways you can authenticate your orchestrator and link it to the IAM role you have created: @@ -114,7 +115,9 @@ If all went well, you should now see the following output: ``` Steps can take 5-15 minutes to start running when using the Sagemaker Orchestrator. -Your orchestrator 'sagemaker' is running remotely. Note that the pipeline run will only show up on the ZenML dashboard once the first step has started executing on the remote infrastructure. +Your orchestrator 'sagemaker' is running remotely. Note that the pipeline run +will only show up on the ZenML dashboard once the first step has started +executing on the remote infrastructure. ``` {% hint style="warning" %} @@ -365,7 +368,9 @@ Note that if you wish to use this orchestrator to run steps on a GPU, you will n ### Scheduling Pipelines -The SageMaker orchestrator supports running pipelines on a schedule using SageMaker's native scheduling capabilities. You can configure schedules in three ways: +The SageMaker orchestrator supports running pipelines on a schedule using +SageMaker's native scheduling capabilities. 
You can configure schedules in +three ways: * Using a cron expression * Using a fixed interval @@ -426,13 +431,45 @@ AWS CLI or API (`aws scheduler delete-schedule `). See details here: [SageMaker Pipeline Schedules](https://docs.aws.amazon.com/sagemaker/latest/dg/pipeline-eventbridge.html) {% endhint %} -#### Required IAM Permissions +#### Required IAM Permissions for schedules -When using scheduled pipelines, you need to ensure your IAM role (either the service connector role or the configured `scheduler_role`) has the correct permissions and trust relationships: +When using scheduled pipelines, you need to ensure your IAM role has the +correct permissions and trust relationships. You can set this up by either +defining an explicit `scheduler_role` in your orchestrator configuration or +you can adjust the role that you are already using on the client side to manage +Sagemaker pipelines. + +```bash +# When registering the orchestrator +zenml orchestrator register sagemaker-orchestrator \ + --flavor=sagemaker \ + --scheduler_role=arn:aws:iam::123456789012:role/my-scheduler-role + +# Or updating an existing orchestrator +zenml orchestrator update sagemaker-orchestrator \ + --scheduler_role=arn:aws:iam::123456789012:role/my-scheduler-role +``` + +{% hint style="info" %} +The IAM role that you are using on the client side can come from multiple +sources depending on how you configured your orchestrator, such as explicit +credentials, a service connector or an implicit authentication. + +If you are using a service connector, keep in mind, this only works with +authentication methods that involve IAM roles (IAM role, Implicit +authentication). LINK +{% endhint %} + +This is particularly useful when: + +* You want to use different roles for creating pipelines and scheduling them +* Your organization's security policies require separate roles for different operations +* You need to grant specific permissions only to the scheduling operations 1. 
**Trust Relationships** - Your service connector role needs to trust both SageMaker and EventBridge Scheduler services: - + Your `scheduler_role` (or your client role if you did not configure + a `scheduler_role`) needs to be assumed by the EventBridge Scheduler + service: ```json { "Version": "2012-10-17", @@ -440,10 +477,8 @@ When using scheduled pipelines, you need to ensure your IAM role (either the ser { "Effect": "Allow", "Principal": { - "AWS": "", ## This is the ARN of the user that is configured in the service connector - # This is the list of services that the service connector role needs to schedule pipelines + "AWS": "", "Service": [ - "sagemaker.amazonaws.com", "scheduler.amazonaws.com" ] }, @@ -453,9 +488,11 @@ When using scheduled pipelines, you need to ensure your IAM role (either the ser } ``` -2. **Required IAM Policies** +2. **Required IAM Permissions for the client role** - The scheduler role (see below) needs the following permissions to manage scheduled pipelines: + In addition to permissions needed to manage pipelines, the role on the +client side also needs the following permissions to create schedules on +EventBridge: ```json { @@ -486,38 +523,22 @@ When using scheduled pipelines, you need to ensure your IAM role (either the ser } ``` -Or you can use the `AmazonEventBridgeSchedulerFullAccess` managed policy. - -These permissions enable: -* Creation and management of Pipeline Schedules -* Setting up trust relationships between services -* Managing IAM policies required for the scheduled execution -* Cleanup of resources when schedules are removed - -Without these permissions, the scheduling functionality will fail. Make sure to configure them before attempting to use scheduled pipelines. - -By default, the SageMaker orchestrator will use the attached [service connector role](../../how-to/infrastructure-deployment/auth-management/aws-service-connector.md) to schedule pipelines. 
However, you can specify a different role to be used for scheduling by configuring the `scheduler_role` parameter: - -```bash -# When registering the orchestrator -zenml orchestrator register sagemaker-orchestrator \ - --flavor=sagemaker \ - --scheduler_role=arn:aws:iam::123456789012:role/my-scheduler-role - -# Or updating an existing orchestrator -zenml orchestrator update sagemaker-orchestrator \ - --scheduler_role=arn:aws:iam::123456789012:role/my-scheduler-role -``` + Or you can use the `AmazonEventBridgeSchedulerFullAccess` managed policy. + + These permissions enable: -This is particularly useful when: -* You want to use different roles for creating pipelines and scheduling them -* Your organization's security policies require separate roles for different operations -* You need to grant specific permissions only to the scheduling operations + * Creation and management of Pipeline Schedules + * Setting up trust relationships between services + * Managing IAM policies required for the scheduled execution + * Cleanup of resources when schedules are removed -If no `scheduler_role` is configured, the orchestrator will: -1. Use the service connector's role -2. Log an informative message about using the service connector role -3. Handle both user credentials and assumed role scenarios appropriately + Without these permissions, the scheduling functionality will fail. Make +sure to configure them before attempting to use scheduled pipelines. +3. **Required IAM Permissions for the `scheduler_role`** + + The `scheduler_role` requires the same permissions as the client role (that +would run the pipeline in a non-scheduled case) to launch and manage Sagemaker +jobs. This would be covered by the `AmazonSageMakerFullAccess` permission.
ZenML Scarf
\ No newline at end of file From 689df66215499337935e47e32c7688974a107c3c Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Sun, 19 Jan 2025 20:32:42 +0100 Subject: [PATCH 44/50] fixing the endpoint --- src/zenml/zen_server/routers/workspaces_endpoints.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/zenml/zen_server/routers/workspaces_endpoints.py b/src/zenml/zen_server/routers/workspaces_endpoints.py index 32af09d2a4..41b33787e0 100644 --- a/src/zenml/zen_server/routers/workspaces_endpoints.py +++ b/src/zenml/zen_server/routers/workspaces_endpoints.py @@ -1009,6 +1009,8 @@ def create_run_metadata( verify_models.append(zen_store().get_artifact_version(resource.id)) elif resource.type == MetadataResourceTypes.MODEL_VERSION: verify_models.append(zen_store().get_model_version(resource.id)) + elif resource.type == MetadataResourceTypes.SCHEDULE: + verify_models.append(zen_store().get_schedule(resource.id)) else: raise RuntimeError(f"Unknown resource type: {resource.type}") From 2d7dc6c605d5ffcf89deee99c8a7b981cdec9422 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Sun, 19 Jan 2025 20:32:52 +0100 Subject: [PATCH 45/50] formatting --- .../integrations/aws/orchestrators/sagemaker_orchestrator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index e2528f7df2..e604f4a591 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -637,7 +637,7 @@ def prepare_or_run_pipeline( ) except Exception as e: logger.debug( - "There was a warning attaching the metadata to the ZenML" + "There was an error attaching metadata to the " f"schedule: {e}" ) @@ -710,7 +710,6 @@ def get_pipeline_run_metadata( Returns: A dictionary of metadata. 
""" - execution_arn = os.environ[ENV_ZENML_SAGEMAKER_RUN_ID] run_metadata: Dict[str, "MetadataType"] = { "pipeline_execution_arn": execution_arn, From c5dfd58db97beab343a6ae68d00f88618a6839ad Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Sun, 19 Jan 2025 20:46:28 +0100 Subject: [PATCH 46/50] improving the error messages --- .../orchestrators/sagemaker_orchestrator.py | 45 +++++++++++-------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index e604f4a591..f726742d77 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -574,48 +574,57 @@ def prepare_or_run_pipeline( # Get the current role ARN if not explicitly configured if self.config.scheduler_role is None: logger.info( - "No scheduler_role configured. Using service connector " - "role to schedule pipeline." + "No scheduler_role configured. Trying to extract it from " + "the client side authentication." ) sts = session.boto_session.client("sts") try: - service_connector_role_arn = sts.get_caller_identity()[ - "Arn" - ] + scheduler_role_arn = sts.get_caller_identity()["Arn"] # If this is a user ARN, try to get the role ARN - if ":user/" in service_connector_role_arn: + if ":user/" in scheduler_role_arn: logger.warning( f"Using IAM user credentials " - f"({service_connector_role_arn}). For production " + f"({scheduler_role_arn}). For production " "environments, it's recommended to use IAM roles " "instead." 
)
                    # If this is an assumed role, extract the role ARN
-                    elif ":assumed-role/" in service_connector_role_arn:
+                    elif ":assumed-role/" in scheduler_role_arn:
                         # Convert assumed-role ARN format to role ARN format
                         # From: arn:aws:sts::123456789012:assumed-role/role-name/session-name
                         # To: arn:aws:iam::123456789012:role/role-name
-                        service_connector_role_arn = re.sub(
+                        scheduler_role_arn = re.sub(
                             r"arn:aws:sts::(\d+):assumed-role/([^/]+)/.*",
                             r"arn:aws:iam::\1:role/\2",
-                            service_connector_role_arn,
+                            scheduler_role_arn,
+                        )
+                    elif ":role/" not in scheduler_role_arn:
+                        raise RuntimeError(
+                            f"Unexpected credential type "
+                            f"({scheduler_role_arn}). Please use IAM "
+                            f"roles for SageMaker pipeline scheduling."
+                        )
+                    else:
+                        raise RuntimeError(
+                            "The ARN of the caller identity "
+                            f"`{scheduler_role_arn}` does not "
+                            "include a user or a proper role."
                         )
                 except Exception:
                     raise RuntimeError(
-                        "Failed to get current role ARN from service "
-                        "connector. This means the service connector is not "
-                        "configured correctly to schedule sagemaker "
-                        "pipelines. You can either fix the service connector "
-                        "or configure `scheduler_role` explicitly in your "
-                        "orchestrator config."
+                        "Failed to get current role ARN. This means that "
+                        "the client side credentials that you are using "
+                        "are not configured correctly to schedule sagemaker "
+                        "pipelines. 
For more information, please check:" + "https://docs.zenml.io/stack-components/orchestrators/sagemaker#required-iam-permissions-for-schedules" ) else: - service_connector_role_arn = self.config.scheduler_role + scheduler_role_arn = self.config.scheduler_role # Attach schedule to pipeline triggers = pipeline.put_triggers( triggers=[schedule], - role_arn=service_connector_role_arn, + role_arn=scheduler_role_arn, ) logger.info(f"The schedule ARN is: {triggers[0]}") From 2f88d09a2c9f1f0312b9c62b10cdd33283a42f90 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Sun, 19 Jan 2025 21:25:49 +0100 Subject: [PATCH 47/50] docstrings, linting and formatting --- .../orchestrators/sagemaker_orchestrator.py | 30 ++++++++++++------- .../schemas/pipeline_run_schemas.py | 9 +++--- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index f726742d77..63cbb50e64 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -86,6 +86,9 @@ def dissect_schedule_arn( Returns: Region Name, Schedule Name (including the group name) + + Raises: + ValueError: If the input is not a properly formatted ARN. 
""" # Split the ARN into parts arn_parts = schedule_arn.split(":") @@ -636,7 +639,7 @@ def prepare_or_run_pipeline( ) Client().create_run_metadata( - metadata=schedule_metadata, + metadata=schedule_metadata, # type: ignore[arg-type] resources=[ RunMetadataResource( id=deployment.schedule.id, @@ -726,18 +729,20 @@ def get_pipeline_run_metadata( zenml_client = Client() - deployment_id = zenml_client.get_pipeline_run(run_id).deployment_id - deployment = zenml_client.get_deployment(deployment_id) + if deployment_id := zenml_client.get_pipeline_run( + run_id + ).deployment_id: + deployment = zenml_client.get_deployment(deployment_id) - settings = cast( - SagemakerOrchestratorSettings, self.get_settings(deployment) - ) + settings = cast( + SagemakerOrchestratorSettings, self.get_settings(deployment) + ) - for metadata in self.compute_metadata( - execution_arn=execution_arn, - settings=settings, - ): - run_metadata.update(metadata) + for metadata in self.compute_metadata( + execution_arn=execution_arn, + settings=settings, + ): + run_metadata.update(metadata) return run_metadata @@ -911,6 +916,9 @@ def generate_schedule_metadata(schedule_arn: str) -> Dict[str, str]: Args: schedule_arn: The trigger ARNs that is generated on the AWS side. + + Returns: + a dictionary containing metadata related to the schedule. 
""" region, name = dissect_schedule_arn(schedule_arn=schedule_arn) diff --git a/src/zenml/zen_stores/schemas/pipeline_run_schemas.py b/src/zenml/zen_stores/schemas/pipeline_run_schemas.py index ff6069666e..05d6f0e3e9 100644 --- a/src/zenml/zen_stores/schemas/pipeline_run_schemas.py +++ b/src/zenml/zen_stores/schemas/pipeline_run_schemas.py @@ -270,10 +270,11 @@ def fetch_metadata_collection(self) -> Dict[str, List[RunMetadataEntry]]: metadata_collection[f"{s.name}::{k}"] = v # Fetch the metadata related to the schedule of this run - if schedule := self.deployment.schedule: - schedule_metadata = schedule.fetch_metadata_collection() - for k, v in schedule_metadata.items(): - metadata_collection[f"schedule:{k}"] = v + if self.deployment is not None: + if schedule := self.deployment.schedule: + schedule_metadata = schedule.fetch_metadata_collection() + for k, v in schedule_metadata.items(): + metadata_collection[f"schedule:{k}"] = v return metadata_collection From 432683655a264381c4a1fc87857e304aedf00a77 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Sun, 19 Jan 2025 21:30:00 +0100 Subject: [PATCH 48/50] adding triggers to mocked_libs --- docs/mocked_libs.json | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/mocked_libs.json b/docs/mocked_libs.json index 338ccdea82..4f385181ac 100644 --- a/docs/mocked_libs.json +++ b/docs/mocked_libs.json @@ -188,6 +188,7 @@ "sagemaker.workflow.execution_variables", "sagemaker.workflow.pipeline", "sagemaker.workflow.steps", + "sagemaker.workflow.triggers", "scipy", "scipy.sparse", "sklearn", From 770f104c5cd5d7beab8ba11720bee2f4ac3ba8a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bar=C4=B1=C5=9F=20Can=20Durak?= <36421093+bcdurak@users.noreply.github.com> Date: Sun, 19 Jan 2025 21:30:36 +0100 Subject: [PATCH 49/50] Update docs/book/component-guide/orchestrators/vertex.md Co-authored-by: hyperlint-ai[bot] <154288675+hyperlint-ai[bot]@users.noreply.github.com> --- docs/book/component-guide/orchestrators/vertex.md | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/book/component-guide/orchestrators/vertex.md b/docs/book/component-guide/orchestrators/vertex.md index 745d423eb0..8045306a0b 100644 --- a/docs/book/component-guide/orchestrators/vertex.md +++ b/docs/book/component-guide/orchestrators/vertex.md @@ -262,7 +262,7 @@ resource_settings = ResourceSettings(cpu_count=8, memory="16GB") ``` To run your pipeline (or some steps of it) on a GPU, you will need to set both -a node selector and the gpu count as follows: +a node selector and the GPU count as follows: ```python from zenml import step, pipeline From 23f37cb096e95b7714f17d1f1ce592f39d6024f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bar=C4=B1=C5=9F=20Can=20Durak?= <36421093+bcdurak@users.noreply.github.com> Date: Sun, 19 Jan 2025 21:30:47 +0100 Subject: [PATCH 50/50] Update docs/book/component-guide/orchestrators/sagemaker.md Co-authored-by: hyperlint-ai[bot] <154288675+hyperlint-ai[bot]@users.noreply.github.com> --- docs/book/component-guide/orchestrators/sagemaker.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/book/component-guide/orchestrators/sagemaker.md b/docs/book/component-guide/orchestrators/sagemaker.md index 179144514e..f7178ca0e8 100644 --- a/docs/book/component-guide/orchestrators/sagemaker.md +++ b/docs/book/component-guide/orchestrators/sagemaker.md @@ -425,7 +425,7 @@ When you deploy a scheduled pipeline, ZenML will: {% hint style="info" %} If you run the same pipeline with a schedule multiple times, the existing schedule will **not** be updated with the new settings. Rather, ZenML will -create a new sagemaker pipeline and attach a new schedule to it. The user +create a new SageMaker pipeline and attach a new schedule to it. The user must manually delete the old pipeline and their attached schedule using the AWS CLI or API (`aws scheduler delete-schedule `). 
See details here: [SageMaker Pipeline Schedules](https://docs.aws.amazon.com/sagemaker/latest/dg/pipeline-eventbridge.html)