From aaea967e05e2bcbe78e969b73b2e4d22996c38c4 Mon Sep 17 00:00:00 2001 From: Prafulla Mahindrakar Date: Tue, 24 Oct 2023 15:43:08 -0700 Subject: [PATCH 01/43] Make fetching HttpClient optional from context in PKCE auth flow (#4295) * Make fetching HttpClient optional from context in PKCE auth flow Signed-off-by: pmahindrakar-oss * Added log statement Signed-off-by: pmahindrakar-oss * added a unit test Signed-off-by: pmahindrakar-oss * nit : spellcheck Signed-off-by: pmahindrakar-oss --------- Signed-off-by: pmahindrakar-oss --- flyteidl/clients/go/admin/pkce/auth_flow_orchestrator.go | 4 ++-- .../clients/go/admin/pkce/auth_flow_orchestrator_test.go | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator.go b/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator.go index cf5a85671e..5d194851f2 100644 --- a/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator.go +++ b/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator.go @@ -2,7 +2,6 @@ package pkce import ( "context" - "errors" "fmt" "net/http" "net/url" @@ -68,7 +67,8 @@ func (f TokenOrchestrator) FetchTokenFromAuthFlow(ctx context.Context) (*oauth2. 
// Pass along http client used in oauth2 httpClient, ok := ctx.Value(oauth2.HTTPClient).(*http.Client) if !ok { - return nil, errors.New("Unable to retrieve httpClient used in oauth2 from context") + logger.Debugf(ctx, "using default http client instead") + httpClient = http.DefaultClient } serveMux.HandleFunc(redirectURL.Path, getAuthServerCallbackHandler(f.ClientConfig, pkceCodeVerifier, diff --git a/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator_test.go b/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator_test.go index 4799e3fabd..dc1c80f63a 100644 --- a/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator_test.go +++ b/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator_test.go @@ -2,6 +2,7 @@ package pkce import ( "context" + "os/exec" "testing" "github.com/stretchr/testify/assert" @@ -29,5 +30,10 @@ func TestFetchFromAuthFlow(t *testing.T) { refreshedToken, err := orchestrator.FetchTokenFromAuthFlow(ctx) assert.Nil(t, refreshedToken) assert.NotNil(t, err) + // Throws an exitError since browser cannot be opened. + // But this makes sure that at least code is tested till the browser action is invoked. 
+ // Earlier change had broken this by introducing a hard dependency on the http Client from the context + _, isAnExitError := err.(*exec.ExitError) + assert.True(t, isAnExitError) }) } From 4bad6c293a6aa4d1113e524da2639e07eb1860ea Mon Sep 17 00:00:00 2001 From: Ashish Mishra <93093775+Ash0807@users.noreply.github.com> Date: Wed, 25 Oct 2023 05:04:35 +0530 Subject: [PATCH 02/43] Delete CONTRIBUTORS.md (#4288) Signed-off-by: Ashish Mishra <93093775+Ash0807@users.noreply.github.com> --- CONTRIBUTORS.md | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 CONTRIBUTORS.md diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md deleted file mode 100644 index eaab5d9c28..0000000000 --- a/CONTRIBUTORS.md +++ /dev/null @@ -1,5 +0,0 @@ -### Contributors - - - - From 1a5af6dc805b670be193190d64b0bccfabac0aee Mon Sep 17 00:00:00 2001 From: Adarsh Jha <132337675+adarsh-jha-dev@users.noreply.github.com> Date: Wed, 25 Oct 2023 14:31:30 +0530 Subject: [PATCH 03/43] Update README.md (#4269) Signed-off-by: Adarsh Jha <132337675+adarsh-jha-dev@users.noreply.github.com> --- rfc/README.md | 114 ++++++++++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 50 deletions(-) diff --git a/rfc/README.md b/rfc/README.md index 0490072aa0..1c451d5d29 100644 --- a/rfc/README.md +++ b/rfc/README.md @@ -2,105 +2,119 @@ >*"To think, you have to write. If you're thinking without writing, you only think you're thinking"*. 
Leslie Lamport +## Table of Contents +- [Introduction](#introduction) +- [When to Follow this Process](#when-to-follow-this-process) +- [Before Creating an RFC](#before-creating-an-rfc) +- [Where to Store RFCs](#where-to-store-rfcs) +- [The Process](#the-process) + - [Opening a Pull Request](#opening-a-pull-request) + - [Acceptance Criteria](#acceptance-criteria) +- [Implementing RFCs](#implementing-rfcs) +- [RFC Postponement](#rfc-postponement) +- [Contributions](#contributions) +- [Prior Art](#prior-art) + +## Introduction + An RFC - Request For Comments - is a document issued mainly to recommend changes to the language and standard libraries. The RFC process aims to provide a consistent and controlled path for changes to enter the project, ensuring that all stakeholders can review and approve them before they are implemented. -## When you need to follow this process -[When you need to follow this process]: #when-you-need-to-follow-this-process +## When to Follow this Process + You need to follow this process if you intend to make a significant change to the system or changes to the user experience. What constitutes a "significant" change is evolving based on community feedback, but may include the following: - - Semantics or syntax changes, other than bugfixes (e.g. introducing a new Flytekit language feature) - - Core-Language updates (e.g. FlyteIdl changes that introduce a new wire-format) - - Documentation restructuring - - Service-to-Service changes (e.g. new gRPC APIs) - - Breaking changes - - Governance changes +- Semantics or syntax changes, other than bugfixes (e.g. introducing a new Flytekit language feature) +- Core-Language updates (e.g. FlyteIdl changes that introduce a new wire-format) +- Documentation restructuring +- Service-to-Service changes (e.g. new gRPC APIs) +- Breaking changes +- Governance changes Changes not requiring RFCs include: - - Documentation rephrasing or reorganizing - - Addition of images, tables, etc. 
to supplement documentation - - Updates to code samples, links, examples, etc. - - Bug fixes that do not introduce changes listed above (use your judgement). +- Documentation rephrasing or reorganizing +- Addition of images, tables, etc. to supplement documentation +- Updates to code samples, links, examples, etc. +- Bug fixes that do not introduce changes listed above (use your judgment). -**NOTE:** new feature implementation will almost certainly require an RFC; PRs attempting to do so will not be approved without a merged RFC. +**NOTE:** New feature implementation will almost certainly require an RFC; PRs attempting to do so will not be approved without a merged RFC. -## Before creating an RFC -[Before creating an RFC]: #before-creating-an-rfc +## Before Creating an RFC Some good practices when starting an RFC include: - Gathering feedback from team or community members first, to confirm that the changes will indeed be useful - Starting a Discussion at the [RFC Incubator](https://github.com/flyteorg/flyte/discussions/new?category=rfc-incubator) to gauge interest. 
Once you have received positive feedback, especially from Maintainers or Steering Committee members, please proceed to: - - Ask in the Discussion for confirmation to submit an RFC - - If there's no objection (silence is approval) then create an Issue from the Discussion ([see how](https://docs.github.com/en/issues/tracking-your-work-with-issues/creating-an-issue#creating-an-issue-from-discussion)) - - Proceed to [open a PR](#opening-a-pull-request) + - Ask in the Discussion for confirmation to submit an RFC + - If there's no objection (silence is approval) then create an Issue from the Discussion ([see how](https://docs.github.com/en/issues/tracking-your-work-with-issues/creating-an-issue#creating-an-issue-from-discussion)) + - Proceed to [open a PR](#opening-a-pull-request) - Discussing the topic on the [#contribute](https://flyte-org.slack.com/archives/C04NJPLRWUX) Slack channel - Adding the topic to the Contributor's [meeting agenda](https://hackmd.io/@davidmirror/rkqCpbK1n) to make sure it aligns with roadmap - Taking the time to produce a well-written, well-thought-of document by using the template located [here](https://github.com/flyteorg/flyte/blob/RFC-Process/rfc/RFC-0000-Template.md). -## Where to store RFCs +## Where to Store RFCs The Flyte repo on GitHub has an RFC folder with 3 directories: -- **Core language:** proposals to `FlyteIdl` that change the wire-format in any way are considered significant changes that require revision and approval. + +- **Core language:** Proposals to `FlyteIdl` that change the wire-format in any way are considered significant changes that require revision and approval. Reviewers: At least one of [Flyte maintainers](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md) and one of [Technical Steering Committee](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md). -- **General System:** Changes to other repos that introduce significant change of behavior or user-impacting features. 
+ +- **General System:** Changes to other repos that introduce a significant change of behavior or user-impacting features. Reviewers: At least one of [Flyte maintainers](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md) and one of [Technical Steering Committee](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md). -- **CI-CD:** Significant changes to CI-CD System that have impact across different repositories. + +- **CI-CD:** Significant changes to CI-CD System that have an impact across different repositories. Reviewer: At least one of [Flyte maintainers](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md). -## The process -![](RFC-Process-diagram-v2.png) +## The Process + ### Opening a Pull Request -* Fork the [flyte repository](https://github.com/flyteorg/flyte). -* Copy `rfc/RFC-0000-Template.md` to `rfc//0000-my-feature.md` (where "my-feature" is descriptive). Don't assign an RFC number yet; this is going to be the PR number and we'll rename the file accordingly if the RFC is accepted. -* Fill in the RFC. Put care into the details. -* Submit a pull request. As a pull request the RFC will receive design feedback from the larger community, and the author should be prepared to revise it in response. -* Now that your RFC has an open pull request, use the pull request number (e.g. `123` from `https://github.com/flyteorg/flyte/pull/123`) to update your 0000- prefix to that number. +- Fork the [flyte repository](https://github.com/flyteorg/flyte). +- Copy `rfc/RFC-0000-Template.md` to `rfc//0000-my-feature.md` (where "my-feature" is descriptive). Don't assign an RFC number yet; this is going to be the PR number and we'll rename the file accordingly if the RFC is accepted. +- Fill in the RFC. Put care into the details. +- Submit a pull request. As a pull request, the RFC will receive design feedback from the larger community, and the author should be prepared to revise it in response. 
+- Now that your RFC has an open pull request, use the pull request number (e.g. `123` from `https://github.com/flyteorg/flyte/pull/123`) to update your 0000- prefix to that number. Once a pull request is opened, the RFC is now in development and the following will happen: -* It will be introduced in a future Contributor's meetup, happening every other week, except otherwise informed. -* The proposal will be discussed as much as possible in the RFC pull request directly. Any outside discussion will be summarized in the comment thread. -* When deemed "ready", a maintainer or TSC member will propose a "motion for Final Comment Period (FCP)" along with a disposition of the outcome (merge, close, or postpone). This step is taken when enough discussions of the tradeoffs have taken place and the community is in a position to make a decision. -* The proposal enters FCP unless there's any objection (lazy consensus). -* The Final Comment Period will last 7 days. If there's no objection, the FCP can close early. -* If no substantial new arguments or ideas are raised, the FCP will follow the outcome decided. -* The RFC author could also withdraw the proposal if they decide to accept a competing proposal as a better alternative or if they deem the original idea as not applicable anymore. +- It will be introduced in a future Contributor's meetup, happening every other week, except otherwise informed. +- The proposal will be discussed as much as possible in the RFC pull request directly. Any outside discussion will be summarized in the comment thread. +- When deemed "ready," a maintainer or TSC member will propose a "motion for Final Comment Period (FCP)" along with a disposition of the outcome (merge, close, or postpone). This step is taken when enough discussions of the tradeoffs have taken place and the community is in a position to make a decision. +- The proposal enters FCP unless there's any objection (lazy consensus). +- The Final Comment Period will last 7 days. 
If there's no objection, the FCP can close early. +- If no substantial new arguments or ideas are raised, the FCP will follow the outcome decided. +- The RFC author could also withdraw the proposal if they decide to accept a competing proposal as a better alternative or if they deem the original idea as not applicable anymore. -### Acceptance criteria +### Acceptance Criteria A proposal is considered Accepted when it has: -* Completed the FCP with no significant objections -* Received an approval vote from a supermajority (2/3) of the [Technical Steering Committee](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md)'s members - +- Completed the FCP with no significant objections +- Received an approval vote from a supermajority (2/3) of the [Technical Steering Committee](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md)'s members ## Implementing RFCs -RFCs for vital features may need to be implemented immediately, whereas other RFCs may need to wait for the right developer to be available. - -The author of an RFC may not always be the one to implement it, but if they do, they are welcome to submit an implementation for review after the RFC has been accepted. +RFCs for vital features may need to be implemented immediately, whereas other RFCs may need to wait for the right developer to be available. The author of an RFC may not always be the one to implement it, but if they do, they are welcome to submit an implementation for review after the RFC has been accepted. ## RFC Postponement -Rejected RFCs may be labelled as "postponed". This means the PR has been closed without implementation, because either: +Rejected RFCs may be labeled as "postponed." This means the PR has been closed without implementation, because either: - the proposed change will need to wait for prerequisite features, - the business model may not be presently able to support it, - or other more urgent changes may need to be implemented first. 
Although "postponed" RFCs have already gone through the approval process, it is always recommended that they be reviewed again prior to implementation, to make sure the existing ecosystem will still benefit from the change. -Alternatively, if an RFC is not likely to be implemented, it should be labelled as "rejected" and permanently closed. +Alternatively, if an RFC is not likely to be implemented, it should be labeled as "rejected" and permanently closed. ## Contributions -[Contributions]: #contributions Any contribution to FlyteOrg repos will be licensed with [Apache-2.0](https://github.com/flyteorg/flyte/blob/master/LICENSE) license without any additional terms or conditions. -## Prior art -* [Rust RFC process](https://github.com/rust-lang/rfcs) -* [Python PEP process](https://peps.python.org/pep-0001/) -* [ASF community design](https://community.apache.org/committers/lazyConsensus.html) +## Prior Art + +- [Rust RFC process](https://github.com/rust-lang/rfcs) +- [Python PEP process](https://peps.python.org/pep-0001/) +- [ASF community design](https://community.apache.org/committers/lazyConsensus.html) From 6cc1f15b2552ad2178a47c193b18b414a439aba0 Mon Sep 17 00:00:00 2001 From: David Espejo <82604841+davidmirror-ops@users.noreply.github.com> Date: Wed, 25 Oct 2023 10:31:52 -0500 Subject: [PATCH 04/43] Add Slack guidelines (#4293) Signed-off-by: davidmirror-ops --- rsts/community/index.rst | 72 ++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 10 deletions(-) diff --git a/rsts/community/index.rst b/rsts/community/index.rst index 5a9d9b620d..c2ee55ae23 100644 --- a/rsts/community/index.rst +++ b/rsts/community/index.rst @@ -26,14 +26,6 @@ Please join us on: :target: https://www.linkedin.com/groups/13962256 :alt: LinkedIn -.. TODO: add back when new newsletter is up and running -.. Also, feel free to sign up for our newsletter, Flyte Monthly, for a quick update on what we've been up to and upcoming events. - -.. .. 
link-button:: https://www.getrevue.co/profile/flyte -.. :type: url -.. :text: Flyte Monthly -.. :classes: btn-outline-secondary - Open Source Community Meeting ----------------------------- @@ -60,7 +52,7 @@ Schedule your session depending on the topic to secure the availability of a mai - Plugin implementation - Platform configuration - **1:00p.m. PT**: - - Flyte deployment on cloud + - Flyte deployment, auth - **9:00p.m. PT**: - Flytekit-related - Use cases @@ -68,8 +60,68 @@ Schedule your session depending on the topic to secure the availability of a mai - Integrations - Newsletter ---------- `Join the Flyte mailing list `_ to receive the monthly newsletter + + +Slack guidelines +----------------- + +Flyte strives to build and maintain an open, inclusive, productive and self-governing open source community. In consequence, +we expect all community members to respect the following guidelines: + +Abide by the `LF's Code of Conduct `__ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +As a Linux Foundation project, we must enforce the rules that govern professional and positive open source communities. + +Avoid using DMs and @mentions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Whenever possible, post your questions and responses in public channels so other Community Members can benefit from the conversation and outcomes. +Exceptions to this are when you need to share private or sensible information. In such a case, the outcome should still be shared publicly. +Limit the use of @mentions of other Community Members to be considerate of notification noise. + +Make use of threads +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Threads help us keep conversations contained and organized, reducing the time it takes to give you the support you need. + +Thread best practices: + +- Don't break your question into multiple messages. Put everything in one. +- For long questions, write a few sentences in the first message, and put the rest in a thread. 
+- If there's a code snippet (more than 5 lines of code), put it inside the thread. +- Avoid using the “Also send to channel” feature unless it's really necessary. +- If your question contains multiple questions, make sure to break them into multiple messages, so each could be answered in a separate thread. + + +Do not post the same question across multiple channels +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you consider that question needs to be shared on other channels, ask it once and then indicate explicitly that you're cross-posting. + +If you're having a tough time getting the support you need (or aren't sure where to go!), please DM @David Espejo(he/him) or @Samhita Alla for support. + +Do not solicit members of our Slack +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Flyte Community exists to collaborate with, learn from, and support one another. It is not a space to pitch your products or services directly to our members via public channels, private channels, or direct messages. + +We are excited to have a growing presence from vendors to help answer questions from Community Members as they may arise, but we have a strict 3-strike policy against solicitation: + +- First occurrence: We'll give you a friendly but public reminder that the behavior is inappropriate according to our guidelines. +- Second occurrence: We'll send you a DM warning that any additional violations will result in removal from the community. +- Third occurrence: We'll delete or ban your account. + +We reserve the right to ban users without notice if they are clearly spamming our Community Members. + +If you want to promote a product or service, go to the #shameless-promotion channel and make sure to follow these rules: + +- Don't post more than two promotional posts per week +- Non-relevant topics aren't allowed + +Messages that don't follow these rules will be deleted. 
+ + From 10748266737a77f74221fdef14acd1fbe87039a2 Mon Sep 17 00:00:00 2001 From: Andrew Dye Date: Wed, 25 Oct 2023 11:49:20 -0700 Subject: [PATCH 05/43] Restore deprecated NotificationsConfig.Region field (#4299) Signed-off-by: Andrew Dye --- charts/flyte-core/templates/admin/configmap.yaml | 5 +++++ flyteadmin/pkg/async/notifications/factory.go | 12 ++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/charts/flyte-core/templates/admin/configmap.yaml b/charts/flyte-core/templates/admin/configmap.yaml index 53af659e8b..04e5cac6b3 100644 --- a/charts/flyte-core/templates/admin/configmap.yaml +++ b/charts/flyte-core/templates/admin/configmap.yaml @@ -48,6 +48,11 @@ data: notifications.yaml: | notifications: type: {{ .Values.workflow_notifications.config.notifications.type }} + {{- if not .Values.workflow_notifications.config.notifications.aws }} + {{- with .Values.workflow_notifications.config.notifications.region }} + region: {{ tpl . $ }} + {{- end }} + {{- end }} {{- if eq .Values.workflow_notifications.config.notifications.type "aws" }} {{- with .Values.workflow_notifications.config.notifications.aws }} aws: {{ tpl (toYaml .) 
$ | nindent 8 }} diff --git a/flyteadmin/pkg/async/notifications/factory.go b/flyteadmin/pkg/async/notifications/factory.go index 53e96f7e67..f94129a1d5 100644 --- a/flyteadmin/pkg/async/notifications/factory.go +++ b/flyteadmin/pkg/async/notifications/factory.go @@ -64,7 +64,11 @@ func GetEmailer(config runtimeInterfaces.NotificationsConfig, scope promutils.Sc switch config.Type { case common.AWS: - awsConfig := aws.NewConfig().WithRegion(config.Region).WithMaxRetries(maxRetries) + region := config.AWSConfig.Region + if region == "" { + region = config.Region + } + awsConfig := aws.NewConfig().WithRegion(region).WithMaxRetries(maxRetries) awsSession, err := session.NewSession(awsConfig) if err != nil { panic(err) @@ -98,7 +102,11 @@ func NewNotificationsProcessor(config runtimeInterfaces.NotificationsConfig, sco // However, the message body of SQS is the SNS message format which isn't Base64 encoded. ConsumeBase64: &enable64decoding, } - sqsConfig.Region = config.Region + if config.AWSConfig.Region != "" { + sqsConfig.Region = config.AWSConfig.Region + } else { + sqsConfig.Region = config.Region + } var err error err = async.Retry(reconnectAttempts, reconnectDelay, func() error { sub, err = gizmoAWS.NewSubscriber(sqsConfig) From 9c26591ec1f48784dbf5c816a3c8e9288dfdf963 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Thu, 26 Oct 2023 05:35:52 +0800 Subject: [PATCH 06/43] Add codespell in `make lint` (#4283) Signed-off-by: Future Outlier Co-authored-by: Future Outlier --- boilerplate/flyte/golang_test_targets/Makefile | 5 ++++- doc-requirements.in | 1 + doc-requirements.txt | 11 +++++++++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/boilerplate/flyte/golang_test_targets/Makefile b/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..9b17490323 100644 --- a/boilerplate/flyte/golang_test_targets/Makefile +++ b/boilerplate/flyte/golang_test_targets/Makefile @@ -3,6 +3,9 @@ # # TO OPT OUT OF UPDATES, SEE 
https://github.com/flyteorg/boilerplate/blob/master/Readme.rst +.PHONY: codespell +codespell: + git ls-files | grep -vE 'go\.mod|go\.sum|flyteidl/|\.pb$$|\.git|\.pdf|\.svg|requirements\.txt|gen/' | xargs codespell -w .PHONY: download_tooling download_tooling: #download dependencies (including test deps) for the package @@ -13,7 +16,7 @@ generate: download_tooling #generate go code @boilerplate/flyte/golang_test_targets/go-gen.sh .PHONY: lint -lint: download_tooling #lints the package for common code smells +lint: codespell download_tooling #lints the package for common code smells GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. diff --git a/doc-requirements.in b/doc-requirements.in index 791b7a9ae7..49f7439659 100644 --- a/doc-requirements.in +++ b/doc-requirements.in @@ -1,3 +1,4 @@ +codespell git+https://github.com/flyteorg/furo@main sphinx sphinx-prompt diff --git a/doc-requirements.txt b/doc-requirements.txt index 557c1958f6..289144e3ad 100644 --- a/doc-requirements.txt +++ b/doc-requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile doc-requirements.in @@ -20,6 +20,8 @@ cfgv==3.4.0 # via pre-commit charset-normalizer==3.2.0 # via requests +codespell==2.2.6 + # via -r doc-requirements.in distlib==0.3.7 # via virtualenv docutils==0.17.1 @@ -37,6 +39,8 @@ idna==3.4 # via requests imagesize==1.4.1 # via sphinx +importlib-metadata==6.8.0 + # via sphinx jinja2==3.0.3 # via # sphinx @@ -92,7 +96,6 @@ sphinx==4.5.0 # sphinx-tags # sphinxcontrib-video # sphinxcontrib-youtube - # sphinxext-remoteliteralinclude sphinx-autoapi==2.0.1 # via -r doc-requirements.in sphinx-basic-ng==1.0.0b2 @@ -133,6 +136,8 @@ sphinxcontrib-youtube==1.2.0 # via -r doc-requirements.in sphinxext-remoteliteralinclude==0.4.0 # via -r doc-requirements.in 
+typing-extensions==4.8.0 + # via astroid unidecode==1.3.6 # via sphinx-autoapi urllib3==2.0.6 @@ -141,6 +146,8 @@ virtualenv==20.24.5 # via pre-commit wrapt==1.15.0 # via astroid +zipp==3.17.0 + # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: # setuptools From b30cb723afc7c966516c8460f7a82246b068bea4 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Thu, 26 Oct 2023 05:50:05 +0800 Subject: [PATCH 07/43] [Docs] Update Development Environment Setup Guide for MonoRepo (#4257) Signed-off-by: Future Outlier Co-authored-by: Future Outlier --- rsts/community/contribute.rst | 68 ++++++++++++++++------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/rsts/community/contribute.rst b/rsts/community/contribute.rst index 6bd877a9c0..accda13b56 100644 --- a/rsts/community/contribute.rst +++ b/rsts/community/contribute.rst @@ -419,48 +419,31 @@ that integrates all Flyte components into a single binary. **4. Build single binary with your own code.** -The following instructions provide guidance on how to build single binary with your customized code, using ``flyteadmin`` as an example. +The following instructions provide guidance on how to build single binary with your customized code under the ``flyteadmin`` as an example. - **Note** Although we'll use ``flyteadmin`` as an example, these steps can be applied to other Flyte components or libraries as well. ``{flyteadmin}`` below can be substituted with other Flyte components/libraries: ``flyteidl``, ``flyteplugins``, ``flytepropeller``, ``datacatalog``, or ``flytestdlib``. - -- **Note** If modifications are needed in multiple components/libraries, the steps will need to be repeated for each component/library. +- **Note** If you want to learn how flyte compiles those components and replace the repositories, you can study how ``go mod edit`` works. .. 
code:: shell - # Step1: Fork and clone the {flyteadmin} repository, modify the source code accordingly. - git clone https://github.com/flyteorg/flyteadmin.git - cd flyteadmin - - # Step2.1: {Flyteadmin} uses Go 1.19, so make sure to switch to Go 1.19. + # Step1: Install Go. Flyte uses Go 1.19, so make sure to switch to Go 1.19. export PATH=$PATH:$(go env GOPATH)/bin go install golang.org/dl/go1.19@latest go1.19 download export GOROOT=$(go1.19 env GOROOT) export PATH="$GOROOT/bin:$PATH" - - # Step2.2: You may need to install goimports to fix lint errors. + + # You may need to install goimports to fix lint errors. # Refer to https://pkg.go.dev/golang.org/x/tools/cmd/goimports go install golang.org/x/tools/cmd/goimports@latest export PATH=$(go env GOPATH)/bin:$PATH - # Step 3.1: Review the go.mod file in the {flyteadmin} directory to identify the Flyte components/libraries that {flyteadmin} relies on. - # If you have modified any of these components/libraries, use `go mod edit -replace` in the {flyteadmin} repo to replace original components/libraries with your customized ones. - # For instance, if you have also modified `flytepropeller`, run the following commands: - go mod edit -replace github.com/flyteorg/flytepropeller=/home/ubuntu/flytepropeller #replace with your own local path to flytepropeller + # Step2: Go to the {flyteadmin} repository, modify the source code accordingly. + cd flyte/flyteadmin - # Step 3.2: Generate code, fix lint errors and run unit tests for {flyteadmin}. - # Note, flyteidl does not have unit tests, so you can skip the `make test_unit` command. - # Note, flytestdlib only have `make generate` command. - make generate - make lint - make test_unit - - # Step4: Now, you can build the single binary. Go back to Flyte directory, run `go mod edit -replace` to replace the {flyteadmin} code with your own. 
- go mod edit -replace github.com/flyteorg/flyteadmin=/home/ubuntu/flyteadmin #replace with your own local path to {flyteadmin} - - # Step5: Rebuild and rerun the single binary based on your own code. + # Step3: Now, you can build the single binary. Go back to Flyte directory. go mod tidy make compile POD_NAMESPACE=flyte ./flyte start --config flyte-single-binary-local.yaml @@ -473,12 +456,10 @@ The following instructions provide guidance on how to build single binary with y # Step1: Install flytekit pip install flytekit && export PATH=$PATH:/home/ubuntu/.local/bin - # Step2: The flytesnacks repository provides a lot of useful examples. - git clone https://github.com/flyteorg/flytesnacks && cd flytesnacks - - # Step3: Run a hello world example - pyflyte run --remote examples/basics/basics/hello_world.py my_wf + # Step2: Run a hello world example + pyflyte run --remote https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/basics/basics/hello_world.py hello_world_wf # Go to http://localhost:30080/console/projects/flytesnacks/domains/development/executions/fd63f88a55fed4bba846 to see execution in the console. + # You can go to the [flytesnacks repository](https://github.com/flyteorg/flytesnacks) to see more useful examples. **6. Tear down the k3s cluster after finishing developing.** @@ -534,20 +515,32 @@ If not, we can start backends with a single command. source ~/.virtualenvs/flytekit/bin/activate make setup pip install -e . - pip install gsutil awscli - # If you are also developing the plugins, execute the following: + + # If you are also developing the plugins, consider the following: + + # Installing Specific Plugins: + # If you wish to only use few plugins, you can install them individually. + # Take [Flytekit BigQuery Plugin](https://github.com/flyteorg/flytekit/tree/master/plugins/flytekit-bigquery#flytekit-bigquery-plugin) for example: + # You have to go to the bigquery plugin folder and install it. 
+ cd plugins/flytekit-bigquery/ + pip install -e . + # Now you can use the bigquery plugin, and the performance is fast. + + # (Optional) Installing All Plugins: + # If you wish to install all available plugins, you can execute the command below. + # However, it's not typically recommended because the current version of plugins does not support + # lazy loading. This can lead to a slowdown in the performance of your Python engine. cd plugins pip install -e . + # Now you can use all plugins, but the performance is slow. # Step2: Modify the source code for flytekit, then run unit tests and lint. make lint make test # Step3: Run a hello world sample to test locally - git clone https://github.com/flyteorg/flytesnacks - cd flytesnacks - python3 examples/basics/basics/hello_world.py my_wf - # Running my_wf() hello world + pyflyte run https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/basics/basics/hello_world.py hello_world_wf + # Running hello_world_wf() hello world **3. Run workflow in sandbox.** @@ -588,9 +581,8 @@ the Flyte cluster, and finally submit the workflow. docker push ${FLYTE_INTERNAL_IMAGE} # Step4: Submit a hello world workflow to the Flyte cluster - git clone https://github.com/flyteorg/flytesnacks cd flytesnacks - pyflyte run --image ${FLYTE_INTERNAL_IMAGE} --remote examples/basics/basics/hello_world.py my_wf + pyflyte run --image ${FLYTE_INTERNAL_IMAGE} --remote https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/basics/basics/hello_world.py hello_world_wf # Go to http://localhost:30080/console/projects/flytesnacks/domains/development/executions/f5c17e1b5640c4336bf8 to see execution in the console. How to setup dev environment for flyteconsole? 
From b6b0d61faf8b94705228a8a212efc34d4f5e6282 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Fri, 27 Oct 2023 03:38:44 +0800 Subject: [PATCH 08/43] support auto fix packages in golint (#4303) Signed-off-by: Future Outlier Co-authored-by: Future Outlier --- boilerplate/flyte/golang_test_targets/Makefile | 2 +- datacatalog/boilerplate/flyte/golang_test_targets/Makefile | 2 +- flyteadmin/boilerplate/flyte/golang_test_targets/Makefile | 2 +- flytecopilot/boilerplate/flyte/golang_test_targets/Makefile | 2 +- flyteidl/boilerplate/flyte/golang_test_targets/Makefile | 2 +- flyteplugins/boilerplate/flyte/golang_test_targets/Makefile | 2 +- flytepropeller/boilerplate/flyte/golang_test_targets/Makefile | 2 +- flytestdlib/boilerplate/flyte/golang_test_targets/Makefile | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/boilerplate/flyte/golang_test_targets/Makefile b/boilerplate/flyte/golang_test_targets/Makefile index 9b17490323..d3ab9817f7 100644 --- a/boilerplate/flyte/golang_test_targets/Makefile +++ b/boilerplate/flyte/golang_test_targets/Makefile @@ -17,7 +17,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: codespell download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. 
# skips 'vendor' diff --git a/datacatalog/boilerplate/flyte/golang_test_targets/Makefile b/datacatalog/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..be72275f5a 100644 --- a/datacatalog/boilerplate/flyte/golang_test_targets/Makefile +++ b/datacatalog/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. # skips 'vendor' diff --git a/flyteadmin/boilerplate/flyte/golang_test_targets/Makefile b/flyteadmin/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..be72275f5a 100644 --- a/flyteadmin/boilerplate/flyte/golang_test_targets/Makefile +++ b/flyteadmin/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. 
# skips 'vendor' diff --git a/flytecopilot/boilerplate/flyte/golang_test_targets/Makefile b/flytecopilot/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..be72275f5a 100644 --- a/flytecopilot/boilerplate/flyte/golang_test_targets/Makefile +++ b/flytecopilot/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. # skips 'vendor' diff --git a/flyteidl/boilerplate/flyte/golang_test_targets/Makefile b/flyteidl/boilerplate/flyte/golang_test_targets/Makefile index dcd95ef842..71764d8cbf 100644 --- a/flyteidl/boilerplate/flyte/golang_test_targets/Makefile +++ b/flyteidl/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. 
# skips 'vendor' diff --git a/flyteplugins/boilerplate/flyte/golang_test_targets/Makefile b/flyteplugins/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..be72275f5a 100644 --- a/flyteplugins/boilerplate/flyte/golang_test_targets/Makefile +++ b/flyteplugins/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. # skips 'vendor' diff --git a/flytepropeller/boilerplate/flyte/golang_test_targets/Makefile b/flytepropeller/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..be72275f5a 100644 --- a/flytepropeller/boilerplate/flyte/golang_test_targets/Makefile +++ b/flytepropeller/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. 
# skips 'vendor' diff --git a/flytestdlib/boilerplate/flyte/golang_test_targets/Makefile b/flytestdlib/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..be72275f5a 100644 --- a/flytestdlib/boilerplate/flyte/golang_test_targets/Makefile +++ b/flytestdlib/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. # skips 'vendor' From ef31e55a7978e2cf23f7fa370def90eac5cdbca8 Mon Sep 17 00:00:00 2001 From: Eduardo Apolinario <653394+eapolinario@users.noreply.github.com> Date: Thu, 26 Oct 2023 18:09:07 -0700 Subject: [PATCH 09/43] Restructure releases (#4304) Signed-off-by: Eduardo Apolinario Co-authored-by: Eduardo Apolinario --- .github/workflows/bump-tags.yml | 58 ------------- .github/workflows/checks.yml | 44 +--------- .github/workflows/create_release.yml | 87 ++++++++++++------- .github/workflows/generate_flyte_manifest.yml | 10 ++- .github/workflows/publish-images.yml | 41 +++++++++ script/release.sh | 39 +++------ 6 files changed, 125 insertions(+), 154 deletions(-) delete mode 100644 .github/workflows/bump-tags.yml create mode 100644 .github/workflows/publish-images.yml diff --git a/.github/workflows/bump-tags.yml b/.github/workflows/bump-tags.yml deleted file mode 100644 index 2ea6cf0b5a..0000000000 --- a/.github/workflows/bump-tags.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: Bump git tags - -on: - workflow_call: - outputs: - next_version: - description: "Tag name" - value: ${{ jobs.bump_version.outputs.next_version }} - secrets: - FLYTE_BOT_PAT: - required: true -jobs: - bump_version: - name: Bump Version - runs-on: ubuntu-latest - outputs: - next_version: ${{ 
steps.get_semver.outputs.next_version }} - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: '0' - # Bump patch version - - uses: rickstaa/action-get-semver@v1 - id: get_semver - with: - bump_level: "patch" - - name: Print current and next versions - run: | - echo "Current version: ${{ steps.get_semver.outputs.current_version }}" - echo "Next version: ${{ steps.get_semver.outputs.next_version }}" - # Generate all tags for all components - - uses: actions/github-script@v6 - with: - github-token: ${{ secrets.FLYTE_BOT_PAT }} - script: | - github.rest.git.createRef({ - owner: context.repo.owner, - repo: context.repo.repo, - ref: `refs/tags/${{ steps.get_semver.outputs.next_version }}`, - sha: context.sha - }) - const components = [ - "datacatalog", - "flyteadmin", - "flytecopilot", - "flyteidl", - "flyteplugins", - "flytepropeller", - "flytestdlib", - ]; - for (const c of components) { - github.rest.git.createRef({ - owner: context.repo.owner, - repo: context.repo.repo, - ref: `refs/tags/${c}/${{ steps.get_semver.outputs.next_version }}`, - sha: context.sha - }) - } diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 34b6ca4781..8c235d3d71 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -134,47 +134,11 @@ jobs: component: ${{ matrix.component }} go-version: ${{ needs.unpack-envvars.outputs.go-version }} - bump-tags: - name: Bump git tags - # TODO(monorepo): skip this if author is flyte-bot? 
- if: ${{ github.event_name != 'pull_request' }} - needs: - - integration - - lint - - unit-tests - - generate - uses: ./.github/workflows/bump-tags.yml - secrets: - FLYTE_BOT_PAT: ${{ secrets.FLYTE_BOT_PAT }} - - # TODO(monorepo): we are not going to release any binary - # goreleaser: - # name: Goreleaser - # needs: [ bump_version ] # Only to ensure it can successfully build - # uses: flyteorg/flytetools/.github/workflows/goreleaser.yml@master - # with: - # go-version: "1.19" - # secrets: - # FLYTE_BOT_PAT: ${{ secrets.FLYTE_BOT_PAT }} - - push_docker_image: - name: Build & Push Image - needs: [ bump-tags ] - strategy: - fail-fast: false - matrix: - component: - - datacatalog - - flyteadmin - - flytecopilot - - flytepropeller - - flytescheduler - uses: ./.github/workflows/publish.yml + build_docker_images: + name: Build Images + uses: ./.github/workflows/publish-images.yml with: - version: ${{ needs.bump-tags.outputs.next_version }} - component: ${{ matrix.component }} - dockerfile: Dockerfile.${{ matrix.component }} - push: true + push: false secrets: FLYTE_BOT_PAT: ${{ secrets.FLYTE_BOT_PAT }} FLYTE_BOT_USERNAME: ${{ secrets.FLYTE_BOT_USERNAME }} diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index 9f807887c2..0323f1d971 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -7,29 +7,57 @@ on: required: true jobs: - bump-version: - name: bump-version + generate-tags: + name: Generate git tags runs-on: ubuntu-latest - outputs: - version: ${{ steps.bump-version.outputs.tag }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: - fetch-depth: "0" + fetch-depth: '0' + - uses: actions/github-script@v6 + with: + github-token: ${{ secrets.FLYTE_BOT_PAT }} + script: | + github.rest.git.createRef({ + owner: context.repo.owner, + repo: context.repo.repo, + ref: `refs/tags/${{ github.event.inputs.version }}`, + sha: context.sha + }) + const components = [ + 
"datacatalog", + "flyteadmin", + "flytecopilot", + "flyteidl", + "flyteplugins", + "flytepropeller", + "flytestdlib", + ]; + for (const c of components) { + github.rest.git.createRef({ + owner: context.repo.owner, + repo: context.repo.repo, + ref: `refs/tags/${c}/${{ github.event.inputs.version }}`, + sha: context.sha + }) + } - - name: Bump version and push tag - id: bump-version - uses: anothrNick/github-tag-action@1.36.0 - env: - GITHUB_TOKEN: ${{ secrets.FLYTE_BOT_PAT }} - WITH_V: true - CUSTOM_TAG: ${{ github.event.inputs.version }} - RELEASE_BRANCHES: master + build-docker-images: + needs: + - generate-tags + uses: ./.github/workflows/publish-images.yml + with: + version: ${{ github.event.inputs.version }} + push: true + secrets: + FLYTE_BOT_PAT: ${{ secrets.FLYTE_BOT_PAT }} + FLYTE_BOT_USERNAME: ${{ secrets.FLYTE_BOT_USERNAME }} publish-flyte-binary-image: name: Publish flyte binary image for the release version runs-on: ubuntu-latest - needs: [bump-version] + needs: + - generate-tags steps: - name: Checkout uses: actions/checkout@v2 @@ -45,24 +73,25 @@ jobs: - name: Tag image to release version run: | - for release in latest ${{ needs.bump-version.outputs.version }}; do + for release in latest ${{ github.event.inputs.version }}; do docker buildx imagetools create --tag "ghcr.io/${{ github.repository_owner }}/flyte-binary-release:${release}" "ghcr.io/${{ github.repository_owner }}/flyte-binary:sha-${{ github.sha }}" done publish-flyte-component-image: name: Publish flyte component image for the release version runs-on: ubuntu-latest - needs: [bump-version] + needs: + - build-docker-images strategy: matrix: component: [ + datacatalog, + flyteadmin, flyteconsole, + flytecopilot, flytepropeller, - flyteadmin, - datacatalog, flytescheduler, - flytecopilot, ] steps: - name: Checkout @@ -76,11 +105,7 @@ jobs: - name: Get Latest Version of component id: set_version run: | - if [ ${{ matrix.component }} = "flytecopilot" ]; then - echo ::set-output name=version::$(yq 
eval '.configmap.copilot.plugins.k8s.co-pilot.image' charts/flyte-core/values.yaml | cut -d ":" -f 2 ) - else - echo ::set-output name=version::$(yq eval '.${{ matrix.component }}.image.tag' charts/flyte-core/values.yaml) - fi + echo ::set-output name=version::$(yq eval '.${{ matrix.component }}.image.tag' charts/flyte-core/values.yaml) shell: bash - name: Login to GitHub Container Registry @@ -92,14 +117,15 @@ jobs: - name: Tag Image to release version run: | - for release in latest ${{ needs.bump-version.outputs.version }}; do + for release in latest ${{ github.event.inputs.version }}; do docker buildx imagetools create --tag "ghcr.io/${{ github.repository_owner }}/${{ matrix.component }}-release:${release}" "ghcr.io/${{ github.repository_owner }}/${{ matrix.component }}:${{ steps.set_version.outputs.version }}" done helm-release: name: Flyte helm release runs-on: ubuntu-latest - needs: [bump-version] + needs: + - build-docker-images steps: - name: Checkout uses: actions/checkout@v2 @@ -118,7 +144,7 @@ jobs: git config user.email "${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>" - name: Prepare Flyte Helm Release env: - VERSION: ${{ needs.bump-version.outputs.version }} + VERSION: ${{ github.event.inputs.version }} REPOSITORY: "https://flyteorg.github.io/flyte" run: | make prepare_artifacts @@ -131,7 +157,8 @@ jobs: manifest-release: name: Flyte manifest release runs-on: ubuntu-latest - needs: [bump-version] + needs: + - build-docker-images steps: - name: Checkout uses: actions/checkout@v2 @@ -140,7 +167,7 @@ jobs: - name: Prepare Flyte Release env: - VERSION: ${{ needs.bump-version.outputs.version }} + VERSION: ${{ github.event.inputs.version }} run: | make prepare_artifacts git stash diff --git a/.github/workflows/generate_flyte_manifest.yml b/.github/workflows/generate_flyte_manifest.yml index cf5e4e3123..913da71a7a 100644 --- a/.github/workflows/generate_flyte_manifest.yml +++ b/.github/workflows/generate_flyte_manifest.yml @@ -2,11 +2,17 
@@ name: Generate Flyte manifest on: workflow_dispatch: + inputs: + next-version: + description: "version name. example v0.1.1" + required: true jobs: update-flyte-releases: name: Update Flyte components runs-on: ubuntu-latest + needs: + - generate-tags steps: - uses: actions/checkout@v2 with: @@ -14,7 +20,9 @@ jobs: - uses: actions/setup-go@v2 with: go-version: "1.19" - - name: Build kustomize + - name: Update references + env: + VERSION: ${{ github.event.inputs.next-version }} run: | make release_automation make kustomize diff --git a/.github/workflows/publish-images.yml b/.github/workflows/publish-images.yml new file mode 100644 index 0000000000..6c12e9a0ce --- /dev/null +++ b/.github/workflows/publish-images.yml @@ -0,0 +1,41 @@ +name: Build & Push All Components Images + +on: + workflow_call: + inputs: + version: + description: "Version of image" + required: false + type: string + default: ci-checks + push: + description: "Push to registry" + required: true + type: boolean + secrets: + FLYTE_BOT_USERNAME: + required: true + FLYTE_BOT_PAT: + required: true + +jobs: + push_docker_image: + name: Build & Push Image + strategy: + fail-fast: false + matrix: + component: + - datacatalog + - flyteadmin + - flytecopilot + - flytepropeller + - flytescheduler + uses: ./.github/workflows/publish.yml + with: + version: ${{ inputs.version }} + component: ${{ matrix.component }} + dockerfile: Dockerfile.${{ matrix.component }} + push: ${{ inputs.push }} + secrets: + FLYTE_BOT_PAT: ${{ secrets.FLYTE_BOT_PAT }} + FLYTE_BOT_USERNAME: ${{ secrets.FLYTE_BOT_USERNAME }} diff --git a/script/release.sh b/script/release.sh index 190ecbc8f1..1f3d763fee 100755 --- a/script/release.sh +++ b/script/release.sh @@ -2,44 +2,33 @@ set -ex -# Get latest release tag (minus the component prefix) -# TODO(monorepo): This only works if we have at least one component tag per release. 
-# In other words, if we have two consecutive releases the latest tag in the second release is going to point to an invalid -# tag (because there will not be images tagged with the previous release tag). -LATEST_TAG=$(git tag | sed 's#[^/]*/##' | sort --version-sort | tail -n1) FLYTEKIT_TAG=$(curl --silent "https://api.github.com/repos/flyteorg/flytekit/releases/latest" | jq -r .tag_name | sed 's/^v//') FLYTECONSOLE_TAG=$(curl --silent "https://api.github.com/repos/flyteorg/flyteconsole/releases/latest" | jq -r .tag_name) # bump latest release of flyte component in kustomize -grep -rlZ "newTag:[^P]*# FLYTEADMIN_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# FLYTEADMIN_TAG/newTag: ${LATEST_TAG} # FLYTEADMIN_TAG/g" {} -grep -rlZ "newTag:[^P]*# DATACATALOG_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# DATACATALOG_TAG/newTag: ${LATEST_TAG} # DATACATALOG_TAG/g" {} +grep -rlZ "newTag:[^P]*# FLYTEADMIN_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# FLYTEADMIN_TAG/newTag: ${VERSION} # FLYTEADMIN_TAG/g" {} +grep -rlZ "newTag:[^P]*# DATACATALOG_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# DATACATALOG_TAG/newTag: ${VERSION} # DATACATALOG_TAG/g" {} grep -rlZ "newTag:[^P]*# FLYTECONSOLE_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# FLYTECONSOLE_TAG/newTag: ${FLYTECONSOLE_TAG} # FLYTECONSOLE_TAG/g" {} -grep -rlZ "newTag:[^P]*# FLYTEPROPELLER_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# FLYTEPROPELLER_TAG/newTag: ${LATEST_TAG} # FLYTEPROPELLER_TAG/g" {} +grep -rlZ "newTag:[^P]*# FLYTEPROPELLER_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# FLYTEPROPELLER_TAG/newTag: ${VERSION} # FLYTEPROPELLER_TAG/g" {} # bump latest release of flyte component in helm -sed -i "s,tag:[^P]*# FLYTEADMIN_TAG,tag: ${LATEST_TAG} # FLYTEADMIN_TAG," ./charts/flyte/values.yaml -sed -i "s,tag:[^P]*# FLYTEADMIN_TAG,tag: ${LATEST_TAG} # FLYTEADMIN_TAG," ./charts/flyte-core/values.yaml 
+sed -i "s,tag:[^P]*# FLYTEADMIN_TAG,tag: ${VERSION} # FLYTEADMIN_TAG," ./charts/flyte/values.yaml +sed -i "s,tag:[^P]*# FLYTEADMIN_TAG,tag: ${VERSION} # FLYTEADMIN_TAG," ./charts/flyte-core/values.yaml -sed -i "s,tag:[^P]*# FLYTESCHEDULER_TAG,tag: ${LATEST_TAG} # FLYTESCHEDULER_TAG," ./charts/flyte/values.yaml -sed -i "s,tag:[^P]*# FLYTESCHEDULER_TAG,tag: ${LATEST_TAG} # FLYTESCHEDULER_TAG," ./charts/flyte-core/values.yaml +sed -i "s,tag:[^P]*# FLYTESCHEDULER_TAG,tag: ${VERSION} # FLYTESCHEDULER_TAG," ./charts/flyte/values.yaml +sed -i "s,tag:[^P]*# FLYTESCHEDULER_TAG,tag: ${VERSION} # FLYTESCHEDULER_TAG," ./charts/flyte-core/values.yaml -sed -i "s,tag:[^P]*# DATACATALOG_TAG,tag: ${LATEST_TAG} # DATACATALOG_TAG," ./charts/flyte/values.yaml -sed -i "s,tag:[^P]*# DATACATALOG_TAG,tag: ${LATEST_TAG} # DATACATALOG_TAG," ./charts/flyte-core/values.yaml +sed -i "s,tag:[^P]*# DATACATALOG_TAG,tag: ${VERSION} # DATACATALOG_TAG," ./charts/flyte/values.yaml +sed -i "s,tag:[^P]*# DATACATALOG_TAG,tag: ${VERSION} # DATACATALOG_TAG," ./charts/flyte-core/values.yaml sed -i "s,tag:[^P]*# FLYTECONSOLE_TAG,tag: ${FLYTECONSOLE_TAG} # FLYTECONSOLE_TAG," ./charts/flyte/values.yaml sed -i "s,tag:[^P]*# FLYTECONSOLE_TAG,tag: ${FLYTECONSOLE_TAG} # FLYTECONSOLE_TAG," ./charts/flyte-core/values.yaml -sed -i "s,tag:[^P]*# FLYTEPROPELLER_TAG,tag: ${LATEST_TAG} # FLYTEPROPELLER_TAG," ./charts/flyte/values.yaml -sed -i "s,tag:[^P]*# FLYTEPROPELLER_TAG,tag: ${LATEST_TAG} # FLYTEPROPELLER_TAG," ./charts/flyte-core/values.yaml +sed -i "s,tag:[^P]*# FLYTEPROPELLER_TAG,tag: ${VERSION} # FLYTEPROPELLER_TAG," ./charts/flyte/values.yaml +sed -i "s,tag:[^P]*# FLYTEPROPELLER_TAG,tag: ${VERSION} # FLYTEPROPELLER_TAG," ./charts/flyte-core/values.yaml -sed -i "s,image:[^P]*# FLYTECOPILOT_IMAGE,image: cr.flyte.org/flyteorg/flytecopilot:${LATEST_TAG} # FLYTECOPILOT_IMAGE," ./charts/flyte/values.yaml -sed -i "s,image:[^P]*# FLYTECOPILOT_IMAGE,image: cr.flyte.org/flyteorg/flytecopilot:${LATEST_TAG} # 
FLYTECOPILOT_IMAGE," ./charts/flyte-core/values.yaml -sed -i "s,tag:[^P]*# FLYTECOPILOT_TAG,tag: ${LATEST_TAG} # FLYTECOPILOT_TAG," ./charts/flyte-binary/values.yaml +sed -i "s,image:[^P]*# FLYTECOPILOT_IMAGE,image: cr.flyte.org/flyteorg/flytecopilot:${VERSION} # FLYTECOPILOT_IMAGE," ./charts/flyte/values.yaml +sed -i "s,image:[^P]*# FLYTECOPILOT_IMAGE,image: cr.flyte.org/flyteorg/flytecopilot:${VERSION} # FLYTECOPILOT_IMAGE," ./charts/flyte-core/values.yaml +sed -i "s,tag:[^P]*# FLYTECOPILOT_TAG,tag: ${VERSION} # FLYTECOPILOT_TAG," ./charts/flyte-binary/values.yaml sed -i "s,tag:[^P]*# FLYTEAGENT_TAG,tag: ${FLYTEKIT_TAG} # FLYTEAGENT_TAG," ./charts/flyteagent/values.yaml - -go get github.com/flyteorg/flyte/flyteadmin@${LATEST_TAG} -go get github.com/flyteorg/flyte/flytepropeller@${LATEST_TAG} -go get github.com/flyteorg/flyte/datacatalog@${LATEST_TAG} -go get github.com/flyteorg/flyte/flytestdlib@${LATEST_TAG} -go mod tidy From 494f8d9538c93500b4215015fc98e2bfe64757a7 Mon Sep 17 00:00:00 2001 From: Eduardo Apolinario <653394+eapolinario@users.noreply.github.com> Date: Thu, 26 Oct 2023 18:09:59 -0700 Subject: [PATCH 10/43] Use debian bookworm as base image (#4311) Signed-off-by: Eduardo Apolinario Co-authored-by: Eduardo Apolinario --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index ea5cba63f3..0134bd8cea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ ARG FLYTECONSOLE_VERSION=latest FROM ghcr.io/flyteorg/flyteconsole:${FLYTECONSOLE_VERSION} AS flyteconsole -FROM --platform=${BUILDPLATFORM} golang:1.19.1-bullseye AS flytebuilder +FROM --platform=${BUILDPLATFORM} golang:1.19.13-bookworm AS flytebuilder ARG TARGETARCH ENV GOARCH "${TARGETARCH}" @@ -26,7 +26,7 @@ RUN --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/r go build -tags console -v -o dist/flyte cmd/main.go -FROM debian:bullseye-slim +FROM debian:bookworm-slim ARG FLYTE_VERSION ENV FLYTE_VERSION 
"${FLYTE_VERSION}" From 10d0775b760a0400ff8f01355358602efd3f3cf4 Mon Sep 17 00:00:00 2001 From: Eduardo Apolinario <653394+eapolinario@users.noreply.github.com> Date: Fri, 27 Oct 2023 13:39:03 -0700 Subject: [PATCH 11/43] Use local version in single-binary (#4294) Signed-off-by: Eduardo Apolinario Co-authored-by: Eduardo Apolinario --- go.mod | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 7b0855bb1f..c2b49c7f95 100644 --- a/go.mod +++ b/go.mod @@ -3,10 +3,10 @@ module github.com/flyteorg/flyte go 1.19 require ( - github.com/flyteorg/flyte/datacatalog v1.9.37 - github.com/flyteorg/flyte/flyteadmin v1.9.37 - github.com/flyteorg/flyte/flytepropeller v1.9.37 - github.com/flyteorg/flyte/flytestdlib v1.9.37 + github.com/flyteorg/flyte/datacatalog v0.0.0-00010101000000-000000000000 + github.com/flyteorg/flyte/flyteadmin v0.0.0-00010101000000-000000000000 + github.com/flyteorg/flyte/flytepropeller v0.0.0-00010101000000-000000000000 + github.com/flyteorg/flyte/flytestdlib v0.0.0-00010101000000-000000000000 github.com/golang/glog v1.1.0 github.com/prometheus/client_golang v1.16.0 github.com/spf13/cobra v1.7.0 From 4e3421406491f546944ba67b2382b062363ff561 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michelle=20=22MishManners=C2=AE=E2=84=A2=22=20Mannering?= <36594527+mishmanners@users.noreply.github.com> Date: Mon, 30 Oct 2023 15:47:33 +1100 Subject: [PATCH 12/43] Accessibility for README (#4322) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add accessibility features based on: https://github.blog/2023-10-26-5-tips-for-making-your-github-profile-page-accessible/ Signed-off-by: Michelle "MishManners®™" Mannering <36594527+mishmanners@users.noreply.github.com> --- README.md | 94 +++++++++++++++++++++++++++---------------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index ba0a5537e0..b83a3f81c5 100644 --- a/README.md +++ b/README.md @@ -7,7 
+7,7 @@

- 🏗️ 🚀 📈 + :building_construction: :rocket: :chart_with_upwards_trend:

@@ -20,21 +20,21 @@ Flyte is an **open-source orchestrator** that facilitates building production-gr

- Current Release + Current Release label - Sandbox Status + Sandbox Status label - Test Status + Test Status label - License + License label - OpenSSF Best Practices + OpenSSF Best Practices label - Flyte Helm Chart + Flyte Helm Chart label - - Flyte Slack + + Flyte Slack label

@@ -47,7 +47,7 @@ Flyte is an **open-source orchestrator** that facilitates building production-gr
-Getting started with Flyte +Getting started with Flyte, showing the welcome screen and Flyte dashboard --- @@ -55,34 +55,34 @@ Flyte is an **open-source orchestrator** that facilitates building production-gr We ship new features, bug fixes and performance improvements regularly. Read our [release notes](https://github.com/flyteorg/flyte/releases) to stay updated. -🚀 **Strongly typed interfaces**: Validate your data at every step of the workflow by defining data guardrails using Flyte types.
-🌐 **Any language**: Write code in any language using raw containers, or choose [Python](https://github.com/flyteorg/flytekit), [Java](https://github.com/flyteorg/flytekit-java), [Scala](https://github.com/flyteorg/flytekit-java) or [JavaScript](https://github.com/NotMatthewGriffin/pterodactyl) SDKs to develop your Flyte workflows.
-📊 **Map tasks**: Achieve parallel code execution with minimal configuration using [map tasks](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/map_task.html).
-🌟 **Dynamic workflows**: [Build flexible and adaptable workflows](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/dynamics.html) that can change and evolve as needed, making it easier to respond to changing requirements.
-🌳 **Branching**: [Selectively execute branches](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/conditions.html) of your workflow based on static or dynamic data produced by other tasks or input data.
-📂 **FlyteFile & FlyteDirectory**: Transfer [files](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/files.html) and [directories](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/folders.html) between local and cloud storage.
-🗃️ **Structured dataset**: Convert dataframes between types and enforce column-level type checking using the abstract 2D representation provided by [Structured Dataset](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/data_types_and_io/structured_dataset.html).
-🛡️ **Recover from failures**: Recover only the failed tasks.
-🔁 **Rerun a single task**: Rerun workflows at the most granular level without modifying the previous state of a data/ML workflow.
-🚦 **Versioned workflows**: Reproduce results and roll back to a previous workflow version any time.
-🔍 **Cache outputs**: Cache task outputs by passing `cache=True` to the task decorator.
-🚩 **Intra-task checkpointing**: [Checkpoint progress](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/checkpoint.html) within a task execution.
-🌎 **Multi-tenancy**: Multiple users can share the same platform while maintaining their own distinct data and configurations.
-⏰ **Timeout**: Define a timeout period, after which the task is marked as failure.
-🔒 **Immutability**: Immutable executions help ensure reproducibility by preventing any changes to the state of an execution.
-🧬 **Data lineage**: Track the movement and transformation of data throughout the lifecycle of your data and ML workflows.
-📈 **Data visualization**: Visualize data, monitor models and view training history through plots.
-🏭 **Dev to prod**: As simple as changing your [domain](https://docs.flyte.org/en/latest/concepts/domains.html) from development or staging to production.
-💸 **Spot or preemptible instances**: Schedule your workflows on spot instances by setting `interruptible` to `True` in the task decorator.
-☁️ **Cloud-native deployment**: Deploy Flyte on AWS, GCP, Azure and other cloud services.
-📅 **Scheduling**: [Schedule](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/productionizing/lp_schedules.html) your data and ML workflows to run at a specific time.
-📢 **Notifications**: Stay informed about changes to your workflow's state by configuring [notifications](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/productionizing/lp_notifications.html) through Slack, PagerDuty or email.
-⌛️ **Timeline view**: Evaluate the duration of each of your Flyte tasks and identify potential bottlenecks.
-💨 **GPU acceleration**: Enable and control your tasks’ GPU demands by requesting resources in the task decorator.
-🐳 **Dependency isolation via containers**: Maintain separate sets of dependencies for your tasks so no dependency conflicts arise.
-🔀 **Parallelism**: Flyte tasks are inherently parallel to optimize resource consumption and improve performance.
-💾 **Allocate resources dynamically** at the task level.
-⏯️ [Wait](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/waiting_for_external_inputs.html) for **external inputs** before proceeding with the execution.
+:rocket: **Strongly typed interfaces**: Validate your data at every step of the workflow by defining data guardrails using Flyte types.
+:globe_with_meridians: **Any language**: Write code in any language using raw containers, or choose [Python](https://github.com/flyteorg/flytekit), [Java](https://github.com/flyteorg/flytekit-java), [Scala](https://github.com/flyteorg/flytekit-java) or [JavaScript](https://github.com/NotMatthewGriffin/pterodactyl) SDKs to develop your Flyte workflows.
+:bar_chart: **Map tasks**: Achieve parallel code execution with minimal configuration using [map tasks](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/map_task.html).
+:star2: **Dynamic workflows**: [Build flexible and adaptable workflows](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/dynamics.html) that can change and evolve as needed, making it easier to respond to changing requirements.
+:deciduous_tree: **Branching**: [Selectively execute branches](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/conditions.html) of your workflow based on static or dynamic data produced by other tasks or input data.
+:open_file_folder: **FlyteFile & FlyteDirectory**: Transfer [files](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/files.html) and [directories](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/folders.html) between local and cloud storage.
+:card_file_box: **Structured dataset**: Convert dataframes between types and enforce column-level type checking using the abstract 2D representation provided by [Structured Dataset](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/data_types_and_io/structured_dataset.html).
+:shield: **Recover from failures**: Recover only the failed tasks.
+:arrows_counterclockwise: **Rerun a single task**: Rerun workflows at the most granular level without modifying the previous state of a data/ML workflow.
+:vertical_traffic_light: **Versioned workflows**: Reproduce results and roll back to a previous workflow version any time.
+:mag: **Cache outputs**: Cache task outputs by passing `cache=True` to the task decorator.
+:triangular_flag_on_post: **Intra-task checkpointing**: [Checkpoint progress](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/checkpoint.html) within a task execution.
+:earth_americas: **Multi-tenancy**: Multiple users can share the same platform while maintaining their own distinct data and configurations.
+:alarm_clock: **Timeout**: Define a timeout period, after which the task is marked as failure.
+:lock: **Immutability**: Immutable executions help ensure reproducibility by preventing any changes to the state of an execution.
+:dna: **Data lineage**: Track the movement and transformation of data throughout the lifecycle of your data and ML workflows.
+:chart_with_upwards_trend: **Data visualization**: Visualize data, monitor models and view training history through plots.
+:factory: **Dev to prod**: As simple as changing your [domain](https://docs.flyte.org/en/latest/concepts/domains.html) from development or staging to production.
+:money_with_wings: **Spot or preemptible instances**: Schedule your workflows on spot instances by setting `interruptible` to `True` in the task decorator.
+:cloud: **Cloud-native deployment**: Deploy Flyte on AWS, GCP, Azure and other cloud services.
+:calendar: **Scheduling**: [Schedule](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/productionizing/lp_schedules.html) your data and ML workflows to run at a specific time.
+:mega: **Notifications**: Stay informed about changes to your workflow's state by configuring [notifications](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/productionizing/lp_notifications.html) through Slack, PagerDuty or email.
+:hourglass: **Timeline view**: Evaluate the duration of each of your Flyte tasks and identify potential bottlenecks.
+:fast_forward: **GPU acceleration**: Enable and control your tasks’ GPU demands by requesting resources in the task decorator.
+:whale: **Dependency isolation via containers**: Maintain separate sets of dependencies for your tasks so no dependency conflicts arise.
+:arrows_counterclockwise: **Parallelism**: Flyte tasks are inherently parallel to optimize resource consumption and improve performance.
+:floppy_disk: **Allocate resources dynamically** at the task level.
+:play_or_pause_button: [Wait](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/waiting_for_external_inputs.html) for **external inputs** before proceeding with the execution.
## Quick start @@ -90,7 +90,7 @@ If you want to try out Flyte, the easiest way to get started is by using the Fly However, if you prefer to install Flyte locally and run a workflow, our [getting started guide](https://docs.flyte.org/projects/cookbook/en/latest/index.html) is the best place to start. It provides step-by-step instructions on how to install Flyte locally and run your first workflow. -> If you're unsure what a Flyte [task](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/basics/task.html) and [workflow](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/basics/workflow.html) are, be sure to check out our guides on both! +> If you're unsure what either a [Flyte task](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/basics/task.html), or a [workflow](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/basics/workflow.html) is, be sure to check out our guides on both! ### Deploy Flyte to production @@ -103,13 +103,13 @@ Join the likes of LinkedIn, Spotify, Freenome, Pachama, Gojek, and Woven Planet ## Resources -📆 [Weekly office hours](https://calendly.com/flyte-office-hours-01/30min): Live informal sessions with the Flyte team held every week. Book a 30-minute slot and get your questions answered.
-👥 [Biweekly community sync](https://www.addevent.com/event/EA7823958): A biweekly event where the Flyte team provides updates on the project and community members can share their progress and ask questions.
-💬 [Slack](https://slack.flyte.org/): Join the Flyte community on Slack to chat with other users, ask questions, and get help.
-⚠️ [GitHub](https://github.com/flyteorg/flyte): Check out the Flyte project on GitHub to file issues, contribute code, and stay up to date on the latest development.
-📹 [Youtube](https://www.youtube.com/channel/UCNduEoLOToNo3nFVly-vUTQ): Tune into panel discussions, customer success stories, community updates and feature deep dives.
-📄 [Blog](https://flyte.org/blog): Here, you can find tutorials and feature deep dives to help you learn more about Flyte.
-💡 [RFCs](rfc/.): RFCs are used for proposing new ideas and features to improve Flyte. You can refer to them to stay updated on the latest developments and contribute to the growth of the platform. +:spiral_calendar: [Weekly office hours](https://calendly.com/flyte-office-hours-01/30min): Live informal sessions with the Flyte team held every week. Book a 30-minute slot and get your questions answered.
+:couple: [Biweekly community sync](https://www.addevent.com/event/EA7823958): A biweekly event where the Flyte team provides updates on the project and community members can share their progress and ask questions.
+:speech_balloon: [Slack](https://slack.flyte.org/): Join the Flyte community on Slack to chat with other users, ask questions, and get help.
+:warning: [GitHub](https://github.com/flyteorg/flyte): Check out the Flyte project on GitHub to file issues, contribute code, and stay up to date on the latest development.
+:video_camera: [Youtube](https://www.youtube.com/channel/UCNduEoLOToNo3nFVly-vUTQ): Tune into panel discussions, customer success stories, community updates and feature deep dives.
+:writing_hand: [Blog](https://flyte.org/blog): Here, you can find tutorials and feature deep dives to help you learn more about Flyte.
+:bulb: [RFCs](rfc/.): RFCs are used for proposing new ideas and features to improve Flyte. You can refer to them to stay updated on the latest developments and contribute to the growth of the platform. ## How to contribute @@ -121,7 +121,7 @@ There are many ways to get involved in Flyte, including: - Taking on a [`help wanted`](https://github.com/flyteorg/flyte/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22+) or [`good-first-issue`](https://github.com/flyteorg/flyte/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) and following the [CONTRIBUTING](https://docs.flyte.org/en/latest/community/contribute.html) guide to submit changes to the codebase. - Upvoting [popular feature requests](https://github.com/flyteorg/flyte/issues?q=is%3Aopen+is%3Aissue+label%3Aenhancement+sort%3Areactions-%2B1-desc) to show your support. -### We ❤️ our contributors +### We :heart: our contributors From ac4ec7d84e88fe8a00a4b850c30a078deaee7f3e Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Mon, 30 Oct 2023 15:53:06 +0800 Subject: [PATCH 13/43] Add tests in `flytepropeller/pkg /controller/executors` from 72.3% to 87.3% coverage (#4276) * add to 87.3% coverage Signed-off-by: Future Outlier * update to the latest pr: update k8s dependencies Signed-off-by: Future Outlier --------- Signed-off-by: Future Outlier Co-authored-by: Future Outlier --- .../executors/execution_context_test.go | 29 +++++++++++++++++++ .../controller/executors/node_lookup_test.go | 15 ++++++++++ 2 files changed, 44 insertions(+) diff --git a/flytepropeller/pkg/controller/executors/execution_context_test.go b/flytepropeller/pkg/controller/executors/execution_context_test.go index 3639057074..6a1561ea7c 100644 --- a/flytepropeller/pkg/controller/executors/execution_context_test.go +++ b/flytepropeller/pkg/controller/executors/execution_context_test.go @@ -63,3 +63,32 @@ func TestExecutionContext(t *testing.T) { assert.Equal(t, typed.TaskDetailsGetter, taskGetter) assert.Equal(t, 
typed.GetParentInfo(), immutableParentInfo2) } + +func TestParentExecutionInfo_GetUniqueID(t *testing.T) { + expectedID := "testID" + parentInfo := NewParentInfo(expectedID, 1) + assert.Equal(t, expectedID, parentInfo.GetUniqueID()) +} + +func TestParentExecutionInfo_CurrentAttempt(t *testing.T) { + expectedAttempt := uint32(123465) + parentInfo := NewParentInfo("testID", expectedAttempt) + assert.Equal(t, expectedAttempt, parentInfo.CurrentAttempt()) +} + +func TestControlFlow_ControlFlowParallelism(t *testing.T) { + cFlow := InitializeControlFlow().(*controlFlow) + assert.Equal(t, uint32(0), cFlow.CurrentParallelism()) + cFlow.IncrementParallelism() + assert.Equal(t, uint32(1), cFlow.CurrentParallelism()) + cFlow.IncrementParallelism() + assert.Equal(t, uint32(2), cFlow.CurrentParallelism()) +} + +func TestNewParentInfo(t *testing.T) { + expectedID := "testID" + expectedAttempt := uint32(123465) + parentInfo := NewParentInfo(expectedID, expectedAttempt).(*parentExecutionInfo) + assert.Equal(t, expectedID, parentInfo.uniqueID) + assert.Equal(t, expectedAttempt, parentInfo.currentAttempts) +} diff --git a/flytepropeller/pkg/controller/executors/node_lookup_test.go b/flytepropeller/pkg/controller/executors/node_lookup_test.go index 889ea62236..885c3262c4 100644 --- a/flytepropeller/pkg/controller/executors/node_lookup_test.go +++ b/flytepropeller/pkg/controller/executors/node_lookup_test.go @@ -48,3 +48,18 @@ func TestNewTestNodeLookup(t *testing.T) { assert.False(t, ok) assert.NotEqual(t, ns, nl.GetNodeExecutionStatus(context.TODO(), "n")) } + +func TestToNodeAndFromNode(t *testing.T) { + n := &mocks.ExecutableNode{} + ns := &mocks.ExecutableNodeStatus{} + nl := NewTestNodeLookup(map[string]v1alpha1.ExecutableNode{"n1": n}, map[string]v1alpha1.ExecutableNodeStatus{"n1": ns}) + + nodeIds, err := nl.ToNode("n1") + assert.Nil(t, nodeIds) + assert.Nil(t, err) + + nodeIds, err = nl.FromNode("n1") + assert.Nil(t, nodeIds) + assert.Nil(t, err) + +} From 
d4981f1d87c1e287a2e9a5d5de376006e7465a3a Mon Sep 17 00:00:00 2001 From: HeetVekariya <91054457+HeetVekariya@users.noreply.github.com> Date: Mon, 30 Oct 2023 14:39:45 +0530 Subject: [PATCH 14/43] fix: remove unused setting in deployment charts (#4252) * fix: remove unused setting in deployment charts Signed-off-by: HeetVekariya * fix: update code with make helm Signed-off-by: HeetVekariya --------- Signed-off-by: HeetVekariya --- charts/flyte-core/templates/datacatalog/service.yaml | 4 ---- deployment/eks/flyte_aws_scheduler_helm_generated.yaml | 4 ---- deployment/eks/flyte_helm_controlplane_generated.yaml | 4 ---- deployment/eks/flyte_helm_generated.yaml | 4 ---- deployment/gcp/flyte_helm_controlplane_generated.yaml | 4 ---- deployment/gcp/flyte_helm_generated.yaml | 4 ---- deployment/sandbox/flyte_helm_generated.yaml | 4 ---- 7 files changed, 28 deletions(-) diff --git a/charts/flyte-core/templates/datacatalog/service.yaml b/charts/flyte-core/templates/datacatalog/service.yaml index 7ea5c84412..494d7c2b38 100644 --- a/charts/flyte-core/templates/datacatalog/service.yaml +++ b/charts/flyte-core/templates/datacatalog/service.yaml @@ -13,10 +13,6 @@ spec: type: {{ . 
}} {{- end }} ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP diff --git a/deployment/eks/flyte_aws_scheduler_helm_generated.yaml b/deployment/eks/flyte_aws_scheduler_helm_generated.yaml index a05c6869df..4ac504ff6f 100644 --- a/deployment/eks/flyte_aws_scheduler_helm_generated.yaml +++ b/deployment/eks/flyte_aws_scheduler_helm_generated.yaml @@ -797,10 +797,6 @@ metadata: spec: type: NodePort ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP diff --git a/deployment/eks/flyte_helm_controlplane_generated.yaml b/deployment/eks/flyte_helm_controlplane_generated.yaml index 323176fea1..78bc58c88e 100644 --- a/deployment/eks/flyte_helm_controlplane_generated.yaml +++ b/deployment/eks/flyte_helm_controlplane_generated.yaml @@ -521,10 +521,6 @@ metadata: spec: type: NodePort ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP diff --git a/deployment/eks/flyte_helm_generated.yaml b/deployment/eks/flyte_helm_generated.yaml index 1139ce2fbc..ba14d17673 100644 --- a/deployment/eks/flyte_helm_generated.yaml +++ b/deployment/eks/flyte_helm_generated.yaml @@ -828,10 +828,6 @@ metadata: spec: type: NodePort ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP diff --git a/deployment/gcp/flyte_helm_controlplane_generated.yaml b/deployment/gcp/flyte_helm_controlplane_generated.yaml index 27d1319bf4..66ce16f784 100644 --- a/deployment/gcp/flyte_helm_controlplane_generated.yaml +++ b/deployment/gcp/flyte_helm_controlplane_generated.yaml @@ -536,10 +536,6 @@ metadata: spec: type: NodePort ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP diff --git a/deployment/gcp/flyte_helm_generated.yaml b/deployment/gcp/flyte_helm_generated.yaml index ac06058686..2144bb6ee5 100644 --- 
a/deployment/gcp/flyte_helm_generated.yaml +++ b/deployment/gcp/flyte_helm_generated.yaml @@ -851,10 +851,6 @@ metadata: spec: type: NodePort ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP diff --git a/deployment/sandbox/flyte_helm_generated.yaml b/deployment/sandbox/flyte_helm_generated.yaml index bd5a1ebf83..82808bf384 100644 --- a/deployment/sandbox/flyte_helm_generated.yaml +++ b/deployment/sandbox/flyte_helm_generated.yaml @@ -6196,10 +6196,6 @@ metadata: spec: type: NodePort ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP From a7fc66481a52a3428947f6eb4e72930448d3611d Mon Sep 17 00:00:00 2001 From: "Fabio M. Graetz, Ph.D" Date: Mon, 30 Oct 2023 16:29:15 +0100 Subject: [PATCH 15/43] Document simplified retry behaviour introduced in #3902 (#4022) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Document simplified retry behaviour introduced in #3902 Signed-off-by: Fabio M. Graetz, Ph.D. Signed-off-by: Fabio Grätz * Update rsts/concepts/tasks.rst Signed-off-by: Fabio M. Graetz, Ph.D. Signed-off-by: Fabio Grätz --------- Signed-off-by: Fabio M. Graetz, Ph.D. Signed-off-by: Fabio Grätz --- rsts/concepts/tasks.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rsts/concepts/tasks.rst b/rsts/concepts/tasks.rst index deff1b30fa..cd1059d2f4 100644 --- a/rsts/concepts/tasks.rst +++ b/rsts/concepts/tasks.rst @@ -106,6 +106,10 @@ System retry can be of two types: Recoverable vs. Non-Recoverable failures: Recoverable failures will be retried and counted against the task's retry count. Non-recoverable failures will just fail, i.e., the task isn’t retried irrespective of user/system retry configurations. All user exceptions are considered non-recoverable unless the exception is a subclass of FlyteRecoverableException. +.. 
note:: + + `RFC 3902 <https://github.com/flyteorg/flyte/pull/3902>`_ implements an alternative, simplified retry behaviour with which both system and user retries are counted towards a single retry budget defined in the task decorator (thus, without a second retry budget defined in the platform configuration). The last retries are always performed on non-spot instances to guarantee completion. To activate this behaviour, set ``configmap.core.propeller.node-config.ignore-retry-cause`` to ``true`` in the helm values. + **Timeouts** To ensure that the system is always making progress, tasks must be guaranteed to end gracefully/successfully. The system defines a default timeout period for the tasks. It is possible for task authors to define a timeout period, after which the task is marked as ``failure``. Note that a timed-out task will be retried if it has a retry strategy defined. The timeout can be handled in the `TaskMetadata `__. From 1b92105d0750da88414962237e530a16e573f81c Mon Sep 17 00:00:00 2001 From: Jeev B Date: Tue, 31 Oct 2023 21:17:13 -0700 Subject: [PATCH 16/43] Add support for capturing Ray job logs via a sidecar (#4266) Signed-off-by: Jeev B --- .../go/tasks/plugins/k8s/ray/config.go | 3 + flyteplugins/go/tasks/plugins/k8s/ray/ray.go | 113 ++++++-- .../go/tasks/plugins/k8s/ray/ray_test.go | 268 ++++++++++++++++-- 3 files changed, 337 insertions(+), 47 deletions(-) diff --git a/flyteplugins/go/tasks/plugins/k8s/ray/config.go b/flyteplugins/go/tasks/plugins/k8s/ray/config.go index 10a8068344..e123c5b8ab 100644 --- a/flyteplugins/go/tasks/plugins/k8s/ray/config.go +++ b/flyteplugins/go/tasks/plugins/k8s/ray/config.go @@ -3,6 +3,8 @@ package ray import ( "context" + v1 "k8s.io/api/core/v1" + pluginsConfig "github.com/flyteorg/flyte/flyteplugins/go/tasks/config" "github.com/flyteorg/flyte/flyteplugins/go/tasks/logs" pluginmachinery "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/k8s" @@ -78,6 +80,7 @@ type Config struct { // Remote Ray Cluster Config RemoteClusterConfig
pluginmachinery.ClusterConfig `json:"remoteClusterConfig" pflag:"Configuration of remote K8s cluster for ray jobs"` Logs logs.LogConfig `json:"logs" pflag:"-,Log configuration for ray jobs"` + LogsSidecar *v1.Container `json:"logsSidecar" pflag:"-,Sidecar to inject into head pods for capturing ray job logs"` Defaults DefaultConfig `json:"defaults" pflag:"-,Default configuration for ray jobs"` EnableUsageStats bool `json:"enableUsageStats" pflag:",Enable usage stats for ray jobs. These stats are submitted to usage-stats.ray.io per https://docs.ray.io/en/latest/cluster/usage-stats.html"` } diff --git a/flyteplugins/go/tasks/plugins/k8s/ray/ray.go b/flyteplugins/go/tasks/plugins/k8s/ray/ray.go index 22840aa909..c1dcc2b8e2 100644 --- a/flyteplugins/go/tasks/plugins/k8s/ray/ray.go +++ b/flyteplugins/go/tasks/plugins/k8s/ray/ray.go @@ -26,6 +26,8 @@ import ( ) const ( + rayStateMountPath = "/tmp/ray" + defaultRayStateVolName = "system-ray-state" rayTaskType = "ray" KindRayJob = "RayJob" IncludeDashboard = "include-dashboard" @@ -61,17 +63,18 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC return nil, flyteerr.Errorf(flyteerr.BadTaskSpecification, "Unable to create pod spec: [%v]", err.Error()) } - var container v1.Container - found := false - for _, c := range podSpec.Containers { + var primaryContainer *v1.Container + var primaryContainerIdx int + for idx, c := range podSpec.Containers { if c.Name == primaryContainerName { - container = c - found = true + c := c + primaryContainer = &c + primaryContainerIdx = idx break } } - if !found { + if primaryContainer == nil { return nil, flyteerr.Errorf(flyteerr.BadTaskSpecification, "Unable to get primary container from the pod: [%v]", err.Error()) } @@ -101,9 +104,15 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC } enableIngress := true + headPodSpec := podSpec.DeepCopy() rayClusterSpec := rayv1alpha1.RayClusterSpec{ HeadGroupSpec: 
rayv1alpha1.HeadGroupSpec{ - Template: buildHeadPodTemplate(&container, podSpec, objectMeta, taskCtx), + Template: buildHeadPodTemplate( + &headPodSpec.Containers[primaryContainerIdx], + headPodSpec, + objectMeta, + taskCtx, + ), ServiceType: v1.ServiceType(cfg.ServiceType), Replicas: &headReplicas, EnableIngress: &enableIngress, @@ -113,7 +122,13 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC } for _, spec := range rayJob.RayCluster.WorkerGroupSpec { - workerPodTemplate := buildWorkerPodTemplate(&container, podSpec, objectMeta, taskCtx) + workerPodSpec := podSpec.DeepCopy() + workerPodTemplate := buildWorkerPodTemplate( + &workerPodSpec.Containers[primaryContainerIdx], + workerPodSpec, + objectMeta, + taskCtx, + ) minReplicas := spec.Replicas maxReplicas := spec.Replicas @@ -161,7 +176,7 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC jobSpec := rayv1alpha1.RayJobSpec{ RayClusterSpec: rayClusterSpec, - Entrypoint: strings.Join(container.Args, " "), + Entrypoint: strings.Join(primaryContainer.Args, " "), ShutdownAfterJobFinishes: cfg.ShutdownAfterJobFinishes, TTLSecondsAfterFinished: &cfg.TTLSecondsAfterFinished, RuntimeEnv: rayJob.RuntimeEnv, @@ -179,10 +194,66 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC return &rayJobObject, nil } -func buildHeadPodTemplate(container *v1.Container, podSpec *v1.PodSpec, objectMeta *metav1.ObjectMeta, taskCtx pluginsCore.TaskExecutionContext) v1.PodTemplateSpec { +func injectLogsSidecar(primaryContainer *v1.Container, podSpec *v1.PodSpec) { + cfg := GetConfig() + if cfg.LogsSidecar == nil { + return + } + sidecar := cfg.LogsSidecar.DeepCopy() + + // Ray logs integration + var rayStateVolMount *v1.VolumeMount + // Look for an existing volume mount on the primary container, mounted at /tmp/ray + for _, vm := range primaryContainer.VolumeMounts { + if vm.MountPath == rayStateMountPath { + vm := vm + rayStateVolMount = &vm + 
break + } + } + // No existing volume mount exists at /tmp/ray. We create a new volume and volume + // mount and add it to the pod and container specs respectively + if rayStateVolMount == nil { + vol := v1.Volume{ + Name: defaultRayStateVolName, + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{}, + }, + } + podSpec.Volumes = append(podSpec.Volumes, vol) + volMount := v1.VolumeMount{ + Name: defaultRayStateVolName, + MountPath: rayStateMountPath, + } + primaryContainer.VolumeMounts = append(primaryContainer.VolumeMounts, volMount) + rayStateVolMount = &volMount + } + // We need to mirror the ray state volume mount into the sidecar as readonly, + // so that we can read the logs written by the head node. + readOnlyRayStateVolMount := *rayStateVolMount.DeepCopy() + readOnlyRayStateVolMount.ReadOnly = true + + // Update volume mounts on sidecar + // If one already exists with the desired mount path, simply replace it. Otherwise, + // add it to sidecar's volume mounts. + foundExistingSidecarVolMount := false + for idx, vm := range sidecar.VolumeMounts { + if vm.MountPath == rayStateMountPath { + foundExistingSidecarVolMount = true + sidecar.VolumeMounts[idx] = readOnlyRayStateVolMount + } + } + if !foundExistingSidecarVolMount { + sidecar.VolumeMounts = append(sidecar.VolumeMounts, readOnlyRayStateVolMount) + } + + // Add sidecar to containers + podSpec.Containers = append(podSpec.Containers, *sidecar) +} + +func buildHeadPodTemplate(primaryContainer *v1.Container, podSpec *v1.PodSpec, objectMeta *metav1.ObjectMeta, taskCtx pluginsCore.TaskExecutionContext) v1.PodTemplateSpec { // Some configs are copy from https://github.com/ray-project/kuberay/blob/b72e6bdcd9b8c77a9dc6b5da8560910f3a0c3ffd/apiserver/pkg/util/cluster.go#L97 // They should always be the same, so we could hard code here. 
- primaryContainer := container.DeepCopy() primaryContainer.Name = "ray-head" envs := []v1.EnvVar{ @@ -217,12 +288,11 @@ func buildHeadPodTemplate(container *v1.Container, podSpec *v1.PodSpec, objectMe primaryContainer.Ports = append(primaryContainer.Ports, ports...) - headPodSpec := podSpec.DeepCopy() - - headPodSpec.Containers = []v1.Container{*primaryContainer} + // Inject a sidecar for capturing and exposing Ray job logs + injectLogsSidecar(primaryContainer, podSpec) podTemplateSpec := v1.PodTemplateSpec{ - Spec: *headPodSpec, + Spec: *podSpec, ObjectMeta: *objectMeta, } cfg := config.GetK8sPluginConfig() @@ -231,7 +301,7 @@ func buildHeadPodTemplate(container *v1.Container, podSpec *v1.PodSpec, objectMe return podTemplateSpec } -func buildWorkerPodTemplate(container *v1.Container, podSpec *v1.PodSpec, objectMetadata *metav1.ObjectMeta, taskCtx pluginsCore.TaskExecutionContext) v1.PodTemplateSpec { +func buildWorkerPodTemplate(primaryContainer *v1.Container, podSpec *v1.PodSpec, objectMetadata *metav1.ObjectMeta, taskCtx pluginsCore.TaskExecutionContext) v1.PodTemplateSpec { // Some configs are copy from https://github.com/ray-project/kuberay/blob/b72e6bdcd9b8c77a9dc6b5da8560910f3a0c3ffd/apiserver/pkg/util/cluster.go#L185 // They should always be the same, so we could hard code here. initContainers := []v1.Container{ @@ -243,10 +313,11 @@ func buildWorkerPodTemplate(container *v1.Container, podSpec *v1.PodSpec, object "-c", "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", }, - Resources: container.Resources, + Resources: primaryContainer.Resources, }, } - primaryContainer := container.DeepCopy() + podSpec.InitContainers = append(podSpec.InitContainers, initContainers...) 
+ primaryContainer.Name = "ray-worker" primaryContainer.Args = []string{} @@ -342,12 +413,8 @@ func buildWorkerPodTemplate(container *v1.Container, podSpec *v1.PodSpec, object } primaryContainer.Ports = append(primaryContainer.Ports, ports...) - workerPodSpec := podSpec.DeepCopy() - workerPodSpec.Containers = []v1.Container{*primaryContainer} - workerPodSpec.InitContainers = initContainers - podTemplateSpec := v1.PodTemplateSpec{ - Spec: *workerPodSpec, + Spec: *podSpec, ObjectMeta: *objectMetadata, } podTemplateSpec.SetLabels(utils.UnionMaps(podTemplateSpec.GetLabels(), utils.CopyMap(taskCtx.TaskExecutionMetadata().GetLabels()))) diff --git a/flyteplugins/go/tasks/plugins/k8s/ray/ray_test.go b/flyteplugins/go/tasks/plugins/k8s/ray/ray_test.go index 6e3253776e..920fa85d61 100644 --- a/flyteplugins/go/tasks/plugins/k8s/ray/ray_test.go +++ b/flyteplugins/go/tasks/plugins/k8s/ray/ray_test.go @@ -5,7 +5,6 @@ import ( "testing" "time" - "github.com/golang/protobuf/jsonpb" structpb "github.com/golang/protobuf/ptypes/struct" rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" "github.com/stretchr/testify/assert" @@ -56,28 +55,36 @@ var ( workerGroupName = "worker-group" ) -func dummyRayCustomObj() *plugins.RayJob { - return &plugins.RayJob{ - RayCluster: &plugins.RayCluster{ - HeadGroupSpec: &plugins.HeadGroupSpec{RayStartParams: map[string]string{"num-cpus": "1"}}, - WorkerGroupSpec: []*plugins.WorkerGroupSpec{{GroupName: workerGroupName, Replicas: 3}}, - }, +func transformRayJobToCustomObj(rayJob *plugins.RayJob) *structpb.Struct { + structObj, err := utils.MarshalObjToStruct(rayJob) + if err != nil { + panic(err) } + return structObj } -func dummyRayTaskTemplate(id string, rayJobObj *plugins.RayJob) *core.TaskTemplate { - ptObjJSON, err := utils.MarshalToString(rayJobObj) +func transformPodSpecToTaskTemplateTarget(podSpec *corev1.PodSpec) *core.TaskTemplate_K8SPod { + structObj, err := utils.MarshalObjToStruct(&podSpec) if err != nil { 
panic(err) } + return &core.TaskTemplate_K8SPod{ + K8SPod: &core.K8SPod{ + PodSpec: structObj, + }, + } +} - structObj := structpb.Struct{} - - err = jsonpb.UnmarshalString(ptObjJSON, &structObj) - if err != nil { - panic(err) +func dummyRayCustomObj() *plugins.RayJob { + return &plugins.RayJob{ + RayCluster: &plugins.RayCluster{ + HeadGroupSpec: &plugins.HeadGroupSpec{RayStartParams: map[string]string{"num-cpus": "1"}}, + WorkerGroupSpec: []*plugins.WorkerGroupSpec{{GroupName: workerGroupName, Replicas: 3}}, + }, } +} +func dummyRayTaskTemplate(id string, rayJob *plugins.RayJob) *core.TaskTemplate { return &core.TaskTemplate{ Id: &core.Identifier{Name: id}, Type: "container", @@ -88,7 +95,7 @@ func dummyRayTaskTemplate(id string, rayJobObj *plugins.RayJob) *core.TaskTempla Env: dummyEnvVars, }, }, - Custom: &structObj, + Custom: transformRayJobToCustomObj(rayJob), } } @@ -198,7 +205,7 @@ func TestBuildResourceRayExtendedResources(t *testing.T) { GpuResourceName: flytek8s.ResourceNvidiaGPU, })) - fixtures := []struct { + params := []struct { name string resources *corev1.ResourceRequirements extendedResourcesBase *core.ExtendedResources @@ -292,11 +299,11 @@ func TestBuildResourceRayExtendedResources(t *testing.T) { }, } - for _, f := range fixtures { - t.Run(f.name, func(t *testing.T) { + for _, p := range params { + t.Run(p.name, func(t *testing.T) { taskTemplate := dummyRayTaskTemplate("ray-id", dummyRayCustomObj()) - taskTemplate.ExtendedResources = f.extendedResourcesBase - taskContext := dummyRayTaskContext(taskTemplate, f.resources, f.extendedResourcesOverride) + taskTemplate.ExtendedResources = p.extendedResourcesBase + taskContext := dummyRayTaskContext(taskTemplate, p.resources, p.extendedResourcesOverride) rayJobResourceHandler := rayJobResourceHandler{} r, err := rayJobResourceHandler.BuildResource(context.TODO(), taskContext) assert.Nil(t, err) @@ -308,12 +315,12 @@ func TestBuildResourceRayExtendedResources(t *testing.T) { headNodeSpec := 
rayJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec assert.EqualValues( t, - f.expectedNsr, + p.expectedNsr, headNodeSpec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, ) assert.EqualValues( t, - f.expectedTol, + p.expectedTol, headNodeSpec.Tolerations, ) @@ -321,12 +328,12 @@ func TestBuildResourceRayExtendedResources(t *testing.T) { workerNodeSpec := rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec assert.EqualValues( t, - f.expectedNsr, + p.expectedNsr, workerNodeSpec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, ) assert.EqualValues( t, - f.expectedTol, + p.expectedTol, workerNodeSpec.Tolerations, ) }) @@ -382,6 +389,219 @@ func TestDefaultStartParameters(t *testing.T) { assert.Equal(t, ray.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Tolerations, toleration) } +func TestInjectLogsSidecar(t *testing.T) { + rayJobObj := transformRayJobToCustomObj(dummyRayCustomObj()) + params := []struct { + name string + taskTemplate core.TaskTemplate + // primaryContainerName string + logsSidecarCfg *corev1.Container + expectedVolumes []corev1.Volume + expectedPrimaryContainerVolumeMounts []corev1.VolumeMount + expectedLogsSidecarVolumeMounts []corev1.VolumeMount + }{ + { + "container target", + core.TaskTemplate{ + Id: &core.Identifier{Name: "ray-id"}, + Target: &core.TaskTemplate_Container{ + Container: &core.Container{ + Image: testImage, + Args: testArgs, + }, + }, + Custom: rayJobObj, + }, + &corev1.Container{ + Name: "logs-sidecar", + Image: "test-image", + }, + []corev1.Volume{ + { + Name: "system-ray-state", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + }, + []corev1.VolumeMount{ + { + Name: "system-ray-state", + MountPath: "/tmp/ray", + }, + }, + []corev1.VolumeMount{ + { + Name: "system-ray-state", + MountPath: "/tmp/ray", + ReadOnly: true, + }, + }, + }, + { + "container target with no sidecar", + 
core.TaskTemplate{ + Id: &core.Identifier{Name: "ray-id"}, + Target: &core.TaskTemplate_Container{ + Container: &core.Container{ + Image: testImage, + Args: testArgs, + }, + }, + Custom: rayJobObj, + }, + nil, + nil, + nil, + nil, + }, + { + "pod target", + core.TaskTemplate{ + Id: &core.Identifier{Name: "ray-id"}, + Target: transformPodSpecToTaskTemplateTarget(&corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "main", + Image: "primary-image", + }, + }, + }), + Custom: rayJobObj, + Config: map[string]string{ + flytek8s.PrimaryContainerKey: "main", + }, + }, + &corev1.Container{ + Name: "logs-sidecar", + Image: "test-image", + }, + []corev1.Volume{ + { + Name: "system-ray-state", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + }, + []corev1.VolumeMount{ + { + Name: "system-ray-state", + MountPath: "/tmp/ray", + }, + }, + []corev1.VolumeMount{ + { + Name: "system-ray-state", + MountPath: "/tmp/ray", + ReadOnly: true, + }, + }, + }, + { + "pod target with existing ray state volume", + core.TaskTemplate{ + Id: &core.Identifier{Name: "ray-id"}, + Target: transformPodSpecToTaskTemplateTarget(&corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "main", + Image: "primary-image", + VolumeMounts: []corev1.VolumeMount{ + { + Name: "test-vol", + MountPath: "/tmp/ray", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "test-vol", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + }, + }), + Custom: rayJobObj, + Config: map[string]string{ + flytek8s.PrimaryContainerKey: "main", + }, + }, + &corev1.Container{ + Name: "logs-sidecar", + Image: "test-image", + }, + []corev1.Volume{ + { + Name: "test-vol", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + }, + []corev1.VolumeMount{ + { + Name: "test-vol", + MountPath: "/tmp/ray", + }, + }, + []corev1.VolumeMount{ + { + Name: "test-vol", + MountPath: "/tmp/ray", + ReadOnly: 
true, + }, + }, + }, + } + + for _, p := range params { + t.Run(p.name, func(t *testing.T) { + assert.NoError(t, SetConfig(&Config{ + LogsSidecar: p.logsSidecarCfg, + })) + taskContext := dummyRayTaskContext(&p.taskTemplate, resourceRequirements, nil) + rayJobResourceHandler := rayJobResourceHandler{} + r, err := rayJobResourceHandler.BuildResource(context.TODO(), taskContext) + assert.Nil(t, err) + assert.NotNil(t, r) + rayJob, ok := r.(*rayv1alpha1.RayJob) + assert.True(t, ok) + + headPodSpec := rayJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec + + // Check volumes + assert.EqualValues(t, p.expectedVolumes, headPodSpec.Volumes) + + // Check containers and respective volume mounts + foundPrimaryContainer := false + foundLogsSidecar := false + for _, cnt := range headPodSpec.Containers { + if cnt.Name == "ray-head" { + foundPrimaryContainer = true + assert.EqualValues( + t, + p.expectedPrimaryContainerVolumeMounts, + cnt.VolumeMounts, + ) + } + if p.logsSidecarCfg != nil && cnt.Name == p.logsSidecarCfg.Name { + foundLogsSidecar = true + assert.EqualValues( + t, + p.expectedLogsSidecarVolumeMounts, + cnt.VolumeMounts, + ) + } + } + assert.Equal(t, true, foundPrimaryContainer) + assert.Equal(t, p.logsSidecarCfg != nil, foundLogsSidecar) + }) + } +} + func newPluginContext() k8s.PluginContext { plg := &mocks2.PluginContext{} From b8cd12cd01c4f8faea961e67361e8eb8af810619 Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Wed, 1 Nov 2023 09:26:13 -0500 Subject: [PATCH 17/43] Not revisiting task nodes and correctly incrementing parallelism (#4318) * adding fix for not revisiting task nodes and correctly incrementing parallelism Signed-off-by: Daniel Rammer * fixed unit tests Signed-off-by: Daniel Rammer --------- Signed-off-by: Daniel Rammer --- .../pkg/controller/nodes/task/handler.go | 6 ++++ .../pkg/controller/nodes/task/handler_test.go | 29 +++++++------------ 2 files changed, 17 insertions(+), 18 deletions(-) diff --git 
a/flytepropeller/pkg/controller/nodes/task/handler.go b/flytepropeller/pkg/controller/nodes/task/handler.go index 6a0f44d648..64bf012b80 100644 --- a/flytepropeller/pkg/controller/nodes/task/handler.go +++ b/flytepropeller/pkg/controller/nodes/task/handler.go @@ -570,6 +570,12 @@ func (t Handler) Handle(ctx context.Context, nCtx interfaces.NodeExecutionContex } if pluginTrns.IsPreviouslyObserved() { logger.Debugf(ctx, "No state change for Task, previously observed same transition. Short circuiting.") + logger.Infof(ctx, "Parallelism now set to [%d].", nCtx.ExecutionContext().IncrementParallelism()) + + // This is a hack to ensure that we do not re-evaluate the same node again in the same round. + if err := nCtx.NodeStateWriter().PutTaskNodeState(ts); err != nil { + return handler.UnknownTransition, err + } return pluginTrns.FinalTransition(ctx) } } diff --git a/flytepropeller/pkg/controller/nodes/task/handler_test.go b/flytepropeller/pkg/controller/nodes/task/handler_test.go index 0e778a5e48..91c0879759 100644 --- a/flytepropeller/pkg/controller/nodes/task/handler_test.go +++ b/flytepropeller/pkg/controller/nodes/task/handler_test.go @@ -530,13 +530,12 @@ func Test_task_Handle_NoCatalog(t *testing.T) { expectedState fakeplugins.NextPhaseState } type want struct { - handlerPhase handler.EPhase - wantErr bool - event bool - eventPhase core.TaskExecution_Phase - skipStateUpdate bool - incrParallel bool - checkpoint bool + handlerPhase handler.EPhase + wantErr bool + event bool + eventPhase core.TaskExecution_Phase + incrParallel bool + checkpoint bool } tests := []struct { name string @@ -666,10 +665,9 @@ func Test_task_Handle_NoCatalog(t *testing.T) { }, }, want{ - handlerPhase: handler.EPhaseRunning, - event: false, - skipStateUpdate: true, - incrParallel: true, + handlerPhase: handler.EPhaseRunning, + event: false, + incrParallel: true, }, }, { @@ -738,13 +736,8 @@ func Test_task_Handle_NoCatalog(t *testing.T) { expectedPhase = pluginCore.PhasePermanentFailure } } 
- if tt.want.skipStateUpdate { - assert.Equal(t, pluginCore.PhaseUndefined, state.s.PluginPhase) - assert.Equal(t, uint32(0), state.s.PluginPhaseVersion) - } else { - assert.Equal(t, expectedPhase.String(), state.s.PluginPhase.String()) - assert.Equal(t, tt.args.expectedState.PhaseVersion, state.s.PluginPhaseVersion) - } + assert.Equal(t, expectedPhase.String(), state.s.PluginPhase.String()) + assert.Equal(t, tt.args.expectedState.PhaseVersion, state.s.PluginPhaseVersion) if tt.want.checkpoint { assert.Equal(t, "s3://sandbox/x/name-n1-1/_flytecheckpoints", got.Info().GetInfo().TaskNodeInfo.TaskNodeMetadata.CheckpointUri) From 617e4811dfc6fedd43c36b369ccfbe10083dc8e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20G=C3=B3mez?= Date: Wed, 1 Nov 2023 18:57:46 +0100 Subject: [PATCH 18/43] Remove duplicated ouputWriter mock (#4342) Signed-off-by: Andres Gomez Ferrer --- flyteplugins/tests/end_to_end.go | 1 - 1 file changed, 1 deletion(-) diff --git a/flyteplugins/tests/end_to_end.go b/flyteplugins/tests/end_to_end.go index 8fad552b57..603b4d3a30 100644 --- a/flyteplugins/tests/end_to_end.go +++ b/flyteplugins/tests/end_to_end.go @@ -82,7 +82,6 @@ func RunPluginEndToEndTest(t *testing.T, executor pluginCore.Plugin, template *i outputWriter.OnGetOutputPath().Return(basePrefix + "/outputs.pb") outputWriter.OnGetCheckpointPrefix().Return("/checkpoint") outputWriter.OnGetPreviousCheckpointsPrefix().Return("/prev") - outputWriter.OnPutMatch(mock.Anything, mock.Anything).Return(nil) outputWriter.OnPutMatch(mock.Anything, mock.Anything).Return(nil).Run(func(args mock.Arguments) { or := args.Get(1).(io.OutputReader) From 5b32a42746d666077df80b0c2266cf287e964e12 Mon Sep 17 00:00:00 2001 From: Jeev B Date: Wed, 1 Nov 2023 12:36:55 -0700 Subject: [PATCH 19/43] Tune sandbox readiness checks to ensure that sandbox is fully accessible when marked as ready (#4348) Signed-off-by: Jeev B --- charts/flyte-binary/templates/deployment.yaml | 1 + 
.../flyte-sandbox/templates/proxy/deployment.yaml | 7 +++++++ .../flyte_sandbox_binary_helm_generated.yaml | 1 + .../sandbox-bundled/manifests/complete-agent.yaml | 14 +++++++++++--- docker/sandbox-bundled/manifests/complete.yaml | 12 ++++++++++-- docker/sandbox-bundled/manifests/dev.yaml | 11 +++++++++-- 6 files changed, 39 insertions(+), 7 deletions(-) diff --git a/charts/flyte-binary/templates/deployment.yaml b/charts/flyte-binary/templates/deployment.yaml index cc8beff2d9..a201a4a300 100644 --- a/charts/flyte-binary/templates/deployment.yaml +++ b/charts/flyte-binary/templates/deployment.yaml @@ -193,6 +193,7 @@ spec: httpGet: path: /healthcheck port: http + initialDelaySeconds: 30 {{- end }} readinessProbe: {{- if .Values.deployment.readinessProbe }} diff --git a/charts/flyte-sandbox/templates/proxy/deployment.yaml b/charts/flyte-sandbox/templates/proxy/deployment.yaml index 9866466f20..74c6db10e8 100644 --- a/charts/flyte-sandbox/templates/proxy/deployment.yaml +++ b/charts/flyte-sandbox/templates/proxy/deployment.yaml @@ -25,6 +25,13 @@ spec: volumeMounts: - name: config mountPath: /etc/envoy + livenessProbe: + tcpSocket: + port: http + initialDelaySeconds: 30 + readinessProbe: + tcpSocket: + port: http volumes: - name: config configMap: diff --git a/deployment/sandbox-binary/flyte_sandbox_binary_helm_generated.yaml b/deployment/sandbox-binary/flyte_sandbox_binary_helm_generated.yaml index a72612995d..1022ed1adc 100644 --- a/deployment/sandbox-binary/flyte_sandbox_binary_helm_generated.yaml +++ b/deployment/sandbox-binary/flyte_sandbox_binary_helm_generated.yaml @@ -417,6 +417,7 @@ spec: httpGet: path: /healthcheck port: http + initialDelaySeconds: 30 readinessProbe: httpGet: path: /healthcheck diff --git a/docker/sandbox-bundled/manifests/complete-agent.yaml b/docker/sandbox-bundled/manifests/complete-agent.yaml index d9d94e5023..2eadbecf80 100644 --- a/docker/sandbox-bundled/manifests/complete-agent.yaml +++ 
b/docker/sandbox-bundled/manifests/complete-agent.yaml @@ -816,7 +816,7 @@ type: Opaque --- apiVersion: v1 data: - haSharedSecret: SWRqSVZmendLdVJvamVGRQ== + haSharedSecret: UzFydXlGcGwyM3FJYWpTdw== proxyPassword: "" proxyUsername: "" kind: Secret @@ -1273,6 +1273,7 @@ spec: httpGet: path: /healthcheck port: http + initialDelaySeconds: 30 name: flyte ports: - containerPort: 8088 @@ -1409,7 +1410,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 4e6248034c039b0bc65cf110d26fccc9d88b8a2d49855a0ecba0443f972ca3c3 + checksum/secret: 12acfc99559b7ddf909a9c66f3645153d66e96bc8dd1a7c1e2ff5af865d51827 labels: app: docker-registry release: flyte-sandbox @@ -1697,10 +1698,17 @@ spec: containers: - image: envoyproxy/envoy:sandbox imagePullPolicy: Never + livenessProbe: + initialDelaySeconds: 30 + tcpSocket: + port: http name: proxy ports: - containerPort: 8000 name: http + readinessProbe: + tcpSocket: + port: http volumeMounts: - mountPath: /etc/envoy name: config @@ -1744,7 +1752,7 @@ spec: value: minio - name: FLYTE_AWS_SECRET_ACCESS_KEY value: miniostorage - image: ghcr.io/flyteorg/flyteagent:1.9.1 + image: ghcr.io/flyteorg/flyteagent:1.10.0 imagePullPolicy: IfNotPresent name: flyteagent ports: diff --git a/docker/sandbox-bundled/manifests/complete.yaml b/docker/sandbox-bundled/manifests/complete.yaml index 998f71d0a0..06f86532c6 100644 --- a/docker/sandbox-bundled/manifests/complete.yaml +++ b/docker/sandbox-bundled/manifests/complete.yaml @@ -805,7 +805,7 @@ type: Opaque --- apiVersion: v1 data: - haSharedSecret: UUREcXo3a1VGNnlyc1RCWg== + haSharedSecret: Y2dKT2xJeEhXaWUyZUcwMg== proxyPassword: "" proxyUsername: "" kind: Secret @@ -1230,6 +1230,7 @@ spec: httpGet: path: /healthcheck port: http + initialDelaySeconds: 30 name: flyte ports: - containerPort: 8088 @@ -1366,7 +1367,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 
- checksum/secret: cfe2089fc583f69d068c3b3d56e875082a5d926c70b00b32f094d587df7396a5 + checksum/secret: 9a48d62cb53f25eaefca010e49af275334df350c757deb72a31fca6dcb2cdb95 labels: app: docker-registry release: flyte-sandbox @@ -1654,10 +1655,17 @@ spec: containers: - image: envoyproxy/envoy:sandbox imagePullPolicy: Never + livenessProbe: + initialDelaySeconds: 30 + tcpSocket: + port: http name: proxy ports: - containerPort: 8000 name: http + readinessProbe: + tcpSocket: + port: http volumeMounts: - mountPath: /etc/envoy name: config diff --git a/docker/sandbox-bundled/manifests/dev.yaml b/docker/sandbox-bundled/manifests/dev.yaml index f88f94fcd3..eda6c8c4ca 100644 --- a/docker/sandbox-bundled/manifests/dev.yaml +++ b/docker/sandbox-bundled/manifests/dev.yaml @@ -499,7 +499,7 @@ metadata: --- apiVersion: v1 data: - haSharedSecret: VjRtTTRpSW94OU9rUVFZTQ== + haSharedSecret: OFd1Q2o4V28zVk1lV0lOcA== proxyPassword: "" proxyUsername: "" kind: Secret @@ -933,7 +933,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 8b25114d47a69e875e3841dace0b87abfda7a16f6833a5cf76b4570092f3a1cd + checksum/secret: 83de0933db4333daa08a690fbbf752532361f657be692a243b79daa7dc1fef1c labels: app: docker-registry release: flyte-sandbox @@ -1221,10 +1221,17 @@ spec: containers: - image: envoyproxy/envoy:sandbox imagePullPolicy: Never + livenessProbe: + initialDelaySeconds: 30 + tcpSocket: + port: http name: proxy ports: - containerPort: 8000 name: http + readinessProbe: + tcpSocket: + port: http volumeMounts: - mountPath: /etc/envoy name: config From 5301da09de12c26008b321ad768680e33d014d96 Mon Sep 17 00:00:00 2001 From: Brandon Black Date: Thu, 2 Nov 2023 11:07:19 -0700 Subject: [PATCH 20/43] Chore: Ensure Stalebot doesn't close issues we've not yet triaged. 
(#4352) * chore(stale.yml): Update exempt-issue-labels to include "untriaged" label Signed-off-by: Brandon Black * Update .github/workflows/stale.yml Co-authored-by: Eduardo Apolinario <653394+eapolinario@users.noreply.github.com> Signed-off-by: Brandon Black --------- Signed-off-by: Brandon Black Co-authored-by: Eduardo Apolinario <653394+eapolinario@users.noreply.github.com> --- .github/workflows/stale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index c5e209e044..143c9aa152 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -15,7 +15,7 @@ jobs: stale-issue-message: "Hello 👋, This issue has been inactive for over 9 months. To help maintain a clean and focused backlog, we'll be marking this issue as stale and will close the issue if we detect no activity in the next 7 days. Thank you for your contribution and understanding! 🙏" close-issue-message: "Hello 👋, This issue has been inactive for over 9 months and hasn't received any updates since it was marked as stale. We'll be closing this issue for now, but if you believe this issue is still relevant, please feel free to reopen it. Thank you for your contribution and understanding! 🙏" stale-issue-label: "stale" - exempt-issue-labels: "needs discussion" # Comma-separated list of labels. + exempt-issue-labels: "needs discussion,untriaged" # Comma-separated list of labels. 
days-before-stale: 270 days-before-close: 7 ascending: true # https://github.com/actions/stale#ascending From a1d182b3690555b784764a59e1268f55b2a083b8 Mon Sep 17 00:00:00 2001 From: Eduardo Apolinario <653394+eapolinario@users.noreply.github.com> Date: Thu, 2 Nov 2023 12:03:05 -0700 Subject: [PATCH 21/43] Do not automatically close stale issues (#4353) * Do not automatically close stale issues Signed-off-by: Eduardo Apolinario * Rephrase message Signed-off-by: Eduardo Apolinario --------- Signed-off-by: Eduardo Apolinario Co-authored-by: Eduardo Apolinario --- .github/workflows/stale.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 143c9aa152..428d42b5ed 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -12,11 +12,14 @@ jobs: - uses: actions/stale@v8 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - stale-issue-message: "Hello 👋, This issue has been inactive for over 9 months. To help maintain a clean and focused backlog, we'll be marking this issue as stale and will close the issue if we detect no activity in the next 7 days. Thank you for your contribution and understanding! 🙏" - close-issue-message: "Hello 👋, This issue has been inactive for over 9 months and hasn't received any updates since it was marked as stale. We'll be closing this issue for now, but if you believe this issue is still relevant, please feel free to reopen it. Thank you for your contribution and understanding! 🙏" + stale-issue-message: > + Hello 👋, this issue has been inactive for over 9 months. To help maintain a clean and focused backlog, + we'll be marking this issue as stale and will engage on it to decide if it is still applicable. + + Thank you for your contribution and understanding! 🙏 stale-issue-label: "stale" exempt-issue-labels: "needs discussion,untriaged" # Comma-separated list of labels. 
days-before-stale: 270 - days-before-close: 7 + days-before-close: -1 ascending: true # https://github.com/actions/stale#ascending operations-per-run: 500 From 6425a2ad20da8b5e847598d7c68b561ec34cde1f Mon Sep 17 00:00:00 2001 From: "Fabio M. Graetz, Ph.D" Date: Fri, 3 Nov 2023 20:21:13 +0100 Subject: [PATCH 22/43] Fix: Set flyteadmin gRPC port to 80 in ingress if using TLS between load balancer and backend (#3964) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Set admin grpc port to 80 in ingress if using TLS between load balancer and backend Signed-off-by: Fabio Grätz * Run 'make helm' Signed-off-by: Fabio Grätz --------- Signed-off-by: Fabio Grätz Co-authored-by: Fabio Grätz --- .../flyte-core/templates/common/ingress.yaml | 52 ++++++++++--------- .../flyte_aws_scheduler_helm_generated.yaml | 1 + .../flyte_helm_controlplane_generated.yaml | 1 + .../eks/flyte_helm_dataplane_generated.yaml | 1 + deployment/eks/flyte_helm_generated.yaml | 1 + .../flyte_helm_controlplane_generated.yaml | 1 + .../gcp/flyte_helm_dataplane_generated.yaml | 1 + deployment/gcp/flyte_helm_generated.yaml | 1 + deployment/sandbox/flyte_helm_generated.yaml | 1 + 9 files changed, 36 insertions(+), 24 deletions(-) diff --git a/charts/flyte-core/templates/common/ingress.yaml b/charts/flyte-core/templates/common/ingress.yaml index 19fd66563e..2a45152a0b 100644 --- a/charts/flyte-core/templates/common/ingress.yaml +++ b/charts/flyte-core/templates/common/ingress.yaml @@ -1,4 +1,8 @@ {{- define "grpcRoutes" -}} +{{- $grpcPort := 81 -}} +{{- if eq .Values.configmap.adminServer.server.security.secure true -}} + {{- $grpcPort = 80 -}} +{{- end }} # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. 
- path: /flyteidl.service.SignalService pathType: ImplementationSpecific @@ -7,10 +11,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.SignalService/* pathType: ImplementationSpecific @@ -19,10 +23,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.AdminService pathType: ImplementationSpecific @@ -31,10 +35,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.AdminService/* pathType: ImplementationSpecific @@ -43,10 +47,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.DataProxyService pathType: ImplementationSpecific @@ -55,10 +59,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.DataProxyService/* pathType: ImplementationSpecific @@ -67,10 +71,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.AuthMetadataService pathType: ImplementationSpecific @@ -79,10 +83,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.AuthMetadataService/* pathType: ImplementationSpecific @@ -91,10 +95,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ 
$grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.IdentityService pathType: ImplementationSpecific @@ -103,10 +107,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.IdentityService/* pathType: ImplementationSpecific @@ -115,10 +119,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /grpc.health.v1.Health pathType: ImplementationSpecific @@ -127,10 +131,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /grpc.health.v1.Health/* pathType: ImplementationSpecific @@ -139,10 +143,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} {{- end }} {{- if .Values.common.ingress.enabled }} diff --git a/deployment/eks/flyte_aws_scheduler_helm_generated.yaml b/deployment/eks/flyte_aws_scheduler_helm_generated.yaml index 4ac504ff6f..00af22b862 100644 --- a/deployment/eks/flyte_aws_scheduler_helm_generated.yaml +++ b/deployment/eks/flyte_aws_scheduler_helm_generated.yaml @@ -1570,6 +1570,7 @@ spec: # path: /* # pathType: ImplementationSpecific # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. 
- path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/eks/flyte_helm_controlplane_generated.yaml b/deployment/eks/flyte_helm_controlplane_generated.yaml index 78bc58c88e..a4ea548579 100644 --- a/deployment/eks/flyte_helm_controlplane_generated.yaml +++ b/deployment/eks/flyte_helm_controlplane_generated.yaml @@ -1198,6 +1198,7 @@ spec: # path: /* # pathType: ImplementationSpecific # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. - path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/eks/flyte_helm_dataplane_generated.yaml b/deployment/eks/flyte_helm_dataplane_generated.yaml index 4fc4767953..6746cc72fd 100644 --- a/deployment/eks/flyte_helm_dataplane_generated.yaml +++ b/deployment/eks/flyte_helm_dataplane_generated.yaml @@ -771,6 +771,7 @@ spec: # path: /* # pathType: ImplementationSpecific # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. - path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/eks/flyte_helm_generated.yaml b/deployment/eks/flyte_helm_generated.yaml index ba14d17673..32399dd395 100644 --- a/deployment/eks/flyte_helm_generated.yaml +++ b/deployment/eks/flyte_helm_generated.yaml @@ -1689,6 +1689,7 @@ spec: # path: /* # pathType: ImplementationSpecific # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. - path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/gcp/flyte_helm_controlplane_generated.yaml b/deployment/gcp/flyte_helm_controlplane_generated.yaml index 66ce16f784..fc63915736 100644 --- a/deployment/gcp/flyte_helm_controlplane_generated.yaml +++ b/deployment/gcp/flyte_helm_controlplane_generated.yaml @@ -1192,6 +1192,7 @@ spec: http: paths: # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. 
- path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/gcp/flyte_helm_dataplane_generated.yaml b/deployment/gcp/flyte_helm_dataplane_generated.yaml index 458d10a150..6b5220ffcd 100644 --- a/deployment/gcp/flyte_helm_dataplane_generated.yaml +++ b/deployment/gcp/flyte_helm_dataplane_generated.yaml @@ -757,6 +757,7 @@ spec: http: paths: # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. - path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/gcp/flyte_helm_generated.yaml b/deployment/gcp/flyte_helm_generated.yaml index 2144bb6ee5..8b0378d749 100644 --- a/deployment/gcp/flyte_helm_generated.yaml +++ b/deployment/gcp/flyte_helm_generated.yaml @@ -1690,6 +1690,7 @@ spec: http: paths: # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. - path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/sandbox/flyte_helm_generated.yaml b/deployment/sandbox/flyte_helm_generated.yaml index 82808bf384..7086a756ef 100644 --- a/deployment/sandbox/flyte_helm_generated.yaml +++ b/deployment/sandbox/flyte_helm_generated.yaml @@ -7701,6 +7701,7 @@ spec: name: flyteadmin port: number: 80 + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. 
- path: /flyteidl.service.SignalService pathType: ImplementationSpecific From 9fe34dbace17037f860c736ba1b8f709ee6cf773 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Mon, 6 Nov 2023 03:53:46 +0800 Subject: [PATCH 23/43] Support Databricks WebAPI 2.1 version and Support `existing_cluster_id` and `new_cluster` options to create a Job (#4361) * Support Databricks Agent Api 2.1 version and Support existing_cluster_id and new_cluster options to create a Job Signed-off-by: Future Outlier * fix delete post data bug, add PENDING message info Signed-off-by: Future Outlier --------- Signed-off-by: Future Outlier Co-authored-by: Future Outlier --- .../webapi/databricks/integration_test.go | 30 ++++++++++++++++++- .../tasks/plugins/webapi/databricks/plugin.go | 24 ++++++++++----- .../plugins/webapi/databricks/plugin_test.go | 2 +- 3 files changed, 47 insertions(+), 9 deletions(-) diff --git a/flyteplugins/go/tasks/plugins/webapi/databricks/integration_test.go b/flyteplugins/go/tasks/plugins/webapi/databricks/integration_test.go index 6a933c6e4f..651892f672 100644 --- a/flyteplugins/go/tasks/plugins/webapi/databricks/integration_test.go +++ b/flyteplugins/go/tasks/plugins/webapi/databricks/integration_test.go @@ -44,7 +44,7 @@ func TestEndToEnd(t *testing.T) { plugin, err := pluginEntry.LoadPlugin(context.TODO(), newFakeSetupContext()) assert.NoError(t, err) - t.Run("run a databricks job", func(t *testing.T) { + t.Run("run a databricks job by new_cluster key", func(t *testing.T) { databricksConfDict := map[string]interface{}{ "name": "flytekit databricks plugin example", "new_cluster": map[string]string{ @@ -75,6 +75,34 @@ func TestEndToEnd(t *testing.T) { phase := tests.RunPluginEndToEndTest(t, plugin, &template, inputs, nil, nil, iter) assert.Equal(t, true, phase.Phase().IsSuccess()) }) + + t.Run("run a databricks job by new_cluster key", func(t *testing.T) { + databricksConfDict := map[string]interface{}{ + "name": "flytekit databricks plugin example", + 
"existing_cluster_id": "1201-my-cluster", + "timeout_seconds": 3600, + "max_retries": 1, + } + databricksConfig, err := utils.MarshalObjToStruct(databricksConfDict) + assert.NoError(t, err) + sparkJob := plugins.SparkJob{DatabricksConf: databricksConfig, DatabricksToken: "token", SparkConf: map[string]string{"spark.driver.bindAddress": "127.0.0.1"}} + st, err := utils.MarshalPbToStruct(&sparkJob) + assert.NoError(t, err) + inputs, _ := coreutils.MakeLiteralMap(map[string]interface{}{"x": 1}) + template := flyteIdlCore.TaskTemplate{ + Type: "databricks", + Custom: st, + Target: &coreIdl.TaskTemplate_Container{ + Container: &coreIdl.Container{ + Command: []string{"command"}, + Args: []string{"pyflyte-execute"}, + }, + }, + } + + phase := tests.RunPluginEndToEndTest(t, plugin, &template, inputs, nil, nil, iter) + assert.Equal(t, true, phase.Phase().IsSuccess()) + }) } func newFakeDatabricksServer() *httptest.Server { diff --git a/flyteplugins/go/tasks/plugins/webapi/databricks/plugin.go b/flyteplugins/go/tasks/plugins/webapi/databricks/plugin.go index 1ce0d747f4..3bd03135dc 100644 --- a/flyteplugins/go/tasks/plugins/webapi/databricks/plugin.go +++ b/flyteplugins/go/tasks/plugins/webapi/databricks/plugin.go @@ -29,7 +29,7 @@ const ( ErrSystem errors.ErrorCode = "System" post string = "POST" get string = "GET" - databricksAPI string = "/api/2.0/jobs/runs" + databricksAPI string = "/api/2.1/jobs/runs" newCluster string = "new_cluster" dockerImage string = "docker_image" sparkConfig string = "spark_conf" @@ -114,10 +114,15 @@ func (p Plugin) Create(ctx context.Context, taskCtx webapi.TaskExecutionContextR return nil, nil, fmt.Errorf("failed to unmarshal databricksJob: %v: %v", sparkJob.DatabricksConf, err) } - if _, ok := databricksJob[newCluster]; ok { - databricksJob[newCluster].(map[string]interface{})[dockerImage] = map[string]string{url: container.Image} - if len(sparkJob.SparkConf) != 0 { - databricksJob[newCluster].(map[string]interface{})[sparkConfig] = 
sparkJob.SparkConf + // If "existing_cluster_id" is in databricks_job, then we don't need to set "new_cluster" + // Refer the docs here: https://docs.databricks.com/en/workflows/jobs/jobs-2.0-api.html#request-structure + if clusterConfig, ok := databricksJob[newCluster].(map[string]interface{}); ok { + if dockerConfig, ok := clusterConfig[dockerImage].(map[string]interface{}); !ok || dockerConfig[url] == nil { + clusterConfig[dockerImage] = map[string]string{url: container.Image} + } + + if clusterConfig[sparkConfig] == nil && len(sparkJob.SparkConf) != 0 { + clusterConfig[sparkConfig] = sparkJob.SparkConf } } databricksJob[sparkPythonTask] = map[string]interface{}{pythonFile: p.cfg.EntrypointFile, parameters: modifiedArgs} @@ -222,7 +227,7 @@ func (p Plugin) Status(ctx context.Context, taskCtx webapi.StatusContext) (phase case http.StatusAccepted: return core.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, taskInfo), nil case http.StatusOK: - if lifeCycleState == "TERMINATED" || lifeCycleState == "SKIPPED" || lifeCycleState == "INTERNAL_ERROR" { + if lifeCycleState == "TERMINATED" || lifeCycleState == "TERMINATING" || lifeCycleState == "INTERNAL_ERROR" { if resultState == "SUCCESS" { if err := writeOutput(ctx, taskCtx); err != nil { pluginsCore.PhaseInfoFailure(string(rune(statusCode)), "failed to write output", taskInfo) @@ -231,6 +236,11 @@ func (p Plugin) Status(ctx context.Context, taskCtx webapi.StatusContext) (phase } return pluginsCore.PhaseInfoFailure(string(rune(statusCode)), message, taskInfo), nil } + + if lifeCycleState == "PENDING" { + return core.PhaseInfoInitializing(time.Now(), core.DefaultPhaseVersion, message, taskInfo), nil + } + return core.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, taskInfo), nil case http.StatusBadRequest: fallthrough @@ -278,7 +288,7 @@ func buildRequest( var err error if isCancel { databricksURL += "/cancel" - data = []byte(fmt.Sprintf("{ run_id: %v }", runID)) + data = []byte(fmt.Sprintf("{ \"run_id\": %v }", 
runID)) } else if method == post { databricksURL += "/submit" mJSON, err := json.Marshal(databricksJob) diff --git a/flyteplugins/go/tasks/plugins/webapi/databricks/plugin_test.go b/flyteplugins/go/tasks/plugins/webapi/databricks/plugin_test.go index 1bafbba223..fda3ab61b0 100644 --- a/flyteplugins/go/tasks/plugins/webapi/databricks/plugin_test.go +++ b/flyteplugins/go/tasks/plugins/webapi/databricks/plugin_test.go @@ -67,7 +67,7 @@ func TestBuildRequest(t *testing.T) { token := "test-token" runID := "019e70eb" databricksEndpoint := "" - databricksURL := "https://" + testInstance + "/api/2.0/jobs/runs" + databricksURL := "https://" + testInstance + "/api/2.1/jobs/runs" t.Run("build http request for submitting a databricks job", func(t *testing.T) { req, err := buildRequest(post, nil, databricksEndpoint, testInstance, token, runID, false) header := http.Header{} From eccf993c4ae462cc0fc475cd0083d0b093d4bc10 Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Mon, 6 Nov 2023 08:45:32 -0600 Subject: [PATCH 24/43] Fixing caching on maptasks when using partials (#4344) * adding non-collection literals to statis input readers for cache key computation Signed-off-by: Daniel Rammer * fixed codespell and added unit tests Signed-off-by: Daniel Rammer --------- Signed-off-by: Daniel Rammer --- .../go/tasks/plugins/array/catalog.go | 42 ++--- .../pkg/controller/nodes/array/handler.go | 7 +- .../nodes/array/node_execution_context.go | 13 +- .../array/node_execution_context_test.go | 160 ++++++++++++++++++ 4 files changed, 188 insertions(+), 34 deletions(-) create mode 100644 flytepropeller/pkg/controller/nodes/array/node_execution_context_test.go diff --git a/flyteplugins/go/tasks/plugins/array/catalog.go b/flyteplugins/go/tasks/plugins/array/catalog.go index 8544555609..836ff52fda 100644 --- a/flyteplugins/go/tasks/plugins/array/catalog.go +++ b/flyteplugins/go/tasks/plugins/array/catalog.go @@ -74,11 +74,10 @@ func DetermineDiscoverability(ctx context.Context, tCtx 
core.TaskExecutionContex return state, errors.Errorf(errors.MetadataAccessFailed, "Could not read inputs and therefore failed to determine array job size") } + // identify and validate the size of the array job size := -1 var literalCollection *idlCore.LiteralCollection - literals := make([][]*idlCore.Literal, 0) - discoveredInputNames := make([]string, 0) - for inputName, literal := range inputs.Literals { + for _, literal := range inputs.Literals { if literalCollection = literal.GetCollection(); literalCollection != nil { // validate length of input list if size != -1 && size != len(literalCollection.Literals) { @@ -86,9 +85,6 @@ func DetermineDiscoverability(ctx context.Context, tCtx core.TaskExecutionContex return state, nil } - literals = append(literals, literalCollection.Literals) - discoveredInputNames = append(discoveredInputNames, inputName) - size = len(literalCollection.Literals) } } @@ -110,7 +106,7 @@ func DetermineDiscoverability(ctx context.Context, tCtx core.TaskExecutionContex arrayJobSize = int64(size) // build input readers - inputReaders = ConstructStaticInputReaders(tCtx.InputReader(), literals, discoveredInputNames) + inputReaders = ConstructStaticInputReaders(tCtx.InputReader(), inputs.Literals, size) } if arrayJobSize > maxArrayJobSize { @@ -246,18 +242,7 @@ func WriteToDiscovery(ctx context.Context, tCtx core.TaskExecutionContext, state return state, externalResources, errors.Errorf(errors.MetadataAccessFailed, "Could not read inputs and therefore failed to determine array job size") } - var literalCollection *idlCore.LiteralCollection - literals := make([][]*idlCore.Literal, 0) - discoveredInputNames := make([]string, 0) - for inputName, literal := range inputs.Literals { - if literalCollection = literal.GetCollection(); literalCollection != nil { - literals = append(literals, literalCollection.Literals) - discoveredInputNames = append(discoveredInputNames, inputName) - } - } - - // build input readers - inputReaders = 
ConstructStaticInputReaders(tCtx.InputReader(), literals, discoveredInputNames) + inputReaders = ConstructStaticInputReaders(tCtx.InputReader(), inputs.Literals, arrayJobSize) } // output reader @@ -476,16 +461,19 @@ func ConstructCatalogReaderWorkItems(ctx context.Context, taskReader core.TaskRe // ConstructStaticInputReaders constructs input readers that comply with the io.InputReader interface but have their // inputs already populated. -func ConstructStaticInputReaders(inputPaths io.InputFilePaths, inputs [][]*idlCore.Literal, inputNames []string) []io.InputReader { - inputReaders := make([]io.InputReader, 0, len(inputs)) - if len(inputs) == 0 { - return inputReaders - } +func ConstructStaticInputReaders(inputPaths io.InputFilePaths, inputLiterals map[string]*idlCore.Literal, arrayJobSize int) []io.InputReader { + var literalCollection *idlCore.LiteralCollection - for i := 0; i < len(inputs[0]); i++ { + inputReaders := make([]io.InputReader, 0, arrayJobSize) + for i := 0; i < arrayJobSize; i++ { literals := make(map[string]*idlCore.Literal) - for j := 0; j < len(inputNames); j++ { - literals[inputNames[j]] = inputs[j][i] + for inputName, inputLiteral := range inputLiterals { + if literalCollection = inputLiteral.GetCollection(); literalCollection != nil { + // if literal is a collection then we need to retrieve the specific literal for this subtask index + literals[inputName] = literalCollection.Literals[i] + } else { + literals[inputName] = inputLiteral + } } inputReaders = append(inputReaders, NewStaticInputReader(inputPaths, &idlCore.LiteralMap{Literals: literals})) diff --git a/flytepropeller/pkg/controller/nodes/array/handler.go b/flytepropeller/pkg/controller/nodes/array/handler.go index 35076d0a97..dddcd0e7c5 100644 --- a/flytepropeller/pkg/controller/nodes/array/handler.go +++ b/flytepropeller/pkg/controller/nodes/array/handler.go @@ -494,7 +494,12 @@ func (a *arrayNodeHandler) buildArrayNodeContext(ctx context.Context, nCtx inter taskPhase := 
int(arrayNodeState.SubNodeTaskPhases.GetItem(subNodeIndex)) // need to initialize the inputReader every time to ensure TaskHandler can access for cache lookups / population - inputLiteralMap, err := constructLiteralMap(ctx, nCtx.InputReader(), subNodeIndex) + inputs, err := nCtx.InputReader().Get(ctx) + if err != nil { + return nil, nil, nil, nil, nil, nil, err + } + + inputLiteralMap, err := constructLiteralMap(inputs, subNodeIndex) if err != nil { return nil, nil, nil, nil, nil, nil, err } diff --git a/flytepropeller/pkg/controller/nodes/array/node_execution_context.go b/flytepropeller/pkg/controller/nodes/array/node_execution_context.go index c639c543e7..17d46d2944 100644 --- a/flytepropeller/pkg/controller/nodes/array/node_execution_context.go +++ b/flytepropeller/pkg/controller/nodes/array/node_execution_context.go @@ -2,6 +2,7 @@ package array import ( "context" + "fmt" "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/io" @@ -26,16 +27,16 @@ func newStaticInputReader(inputPaths io.InputFilePaths, input *core.LiteralMap) } } -func constructLiteralMap(ctx context.Context, inputReader io.InputReader, index int) (*core.LiteralMap, error) { - inputs, err := inputReader.Get(ctx) - if err != nil { - return nil, err - } - +func constructLiteralMap(inputs *core.LiteralMap, index int) (*core.LiteralMap, error) { literals := make(map[string]*core.Literal) for name, literal := range inputs.Literals { if literalCollection := literal.GetCollection(); literalCollection != nil { + if index >= len(literalCollection.Literals) { + return nil, fmt.Errorf("index %v out of bounds for literal collection %v", index, name) + } literals[name] = literalCollection.Literals[index] + } else { + literals[name] = literal } } diff --git a/flytepropeller/pkg/controller/nodes/array/node_execution_context_test.go b/flytepropeller/pkg/controller/nodes/array/node_execution_context_test.go new file mode 100644 index 
0000000000..0a5546bd25 --- /dev/null +++ b/flytepropeller/pkg/controller/nodes/array/node_execution_context_test.go @@ -0,0 +1,160 @@ +package array + +import ( + "testing" + + "github.com/golang/protobuf/proto" + "github.com/stretchr/testify/assert" + + "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" +) + +var ( + literalOne = &core.Literal{ + Value: &core.Literal_Scalar{ + Scalar: &core.Scalar{ + Value: &core.Scalar_Primitive{ + Primitive: &core.Primitive{ + Value: &core.Primitive_Integer{ + Integer: 1, + }, + }, + }, + }, + }, + } + literalTwo = &core.Literal{ + Value: &core.Literal_Scalar{ + Scalar: &core.Scalar{ + Value: &core.Scalar_Primitive{ + Primitive: &core.Primitive{ + Value: &core.Primitive_Integer{ + Integer: 2, + }, + }, + }, + }, + }, + } +) + +func TestConstructLiteralMap(t *testing.T) { + tests := []struct { + name string + inputLiteralMaps *core.LiteralMap + expectedLiteralMaps []*core.LiteralMap + }{ + { + "SingleList", + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": &core.Literal{ + Value: &core.Literal_Collection{ + Collection: &core.LiteralCollection{ + Literals: []*core.Literal{ + literalOne, + literalTwo, + }, + }, + }, + }, + }, + }, + []*core.LiteralMap{ + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": literalOne, + }, + }, + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": literalTwo, + }, + }, + }, + }, + { + "MultiList", + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": &core.Literal{ + Value: &core.Literal_Collection{ + Collection: &core.LiteralCollection{ + Literals: []*core.Literal{ + literalOne, + literalTwo, + }, + }, + }, + }, + "bar": &core.Literal{ + Value: &core.Literal_Collection{ + Collection: &core.LiteralCollection{ + Literals: []*core.Literal{ + literalTwo, + literalOne, + }, + }, + }, + }, + }, + }, + []*core.LiteralMap{ + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": literalOne, + "bar": literalTwo, + }, + }, + 
&core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": literalTwo, + "bar": literalOne, + }, + }, + }, + }, + { + "Partial", + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": &core.Literal{ + Value: &core.Literal_Collection{ + Collection: &core.LiteralCollection{ + Literals: []*core.Literal{ + literalOne, + literalTwo, + }, + }, + }, + }, + "bar": literalTwo, + }, + }, + []*core.LiteralMap{ + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": literalOne, + "bar": literalTwo, + }, + }, + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": literalTwo, + "bar": literalTwo, + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + for i := 0; i < len(test.expectedLiteralMaps); i++ { + outputLiteralMap, err := constructLiteralMap(test.inputLiteralMaps, i) + assert.NoError(t, err) + assert.True(t, proto.Equal(test.expectedLiteralMaps[i], outputLiteralMap)) + } + }) + } +} From 939eb8b898379a7c699f217f9ccbfa46d0164089 Mon Sep 17 00:00:00 2001 From: Honnix Date: Mon, 6 Nov 2023 20:20:38 +0100 Subject: [PATCH 25/43] Fix read raw limit (#4370) Signed-off-by: Hongxin Liang --- flytestdlib/storage/stow_store.go | 4 ++-- flytestdlib/storage/stow_store_test.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flytestdlib/storage/stow_store.go b/flytestdlib/storage/stow_store.go index ddda2ce762..701604d437 100644 --- a/flytestdlib/storage/stow_store.go +++ b/flytestdlib/storage/stow_store.go @@ -265,8 +265,8 @@ func (s *StowStore) ReadRaw(ctx context.Context, reference DataReference) (io.Re } if GetConfig().Limits.GetLimitMegabytes != 0 { - if sizeMbs := sizeBytes / MiB; sizeMbs > GetConfig().Limits.GetLimitMegabytes { - return nil, errors.Errorf(ErrExceedsLimit, "limit exceeded. %vmb > %vmb.", sizeMbs, GetConfig().Limits.GetLimitMegabytes) + if sizeBytes > GetConfig().Limits.GetLimitMegabytes*MiB { + return nil, errors.Errorf(ErrExceedsLimit, "limit exceeded. 
%.6fmb > %vmb.", float64(sizeBytes)/float64(MiB), GetConfig().Limits.GetLimitMegabytes) } } diff --git a/flytestdlib/storage/stow_store_test.go b/flytestdlib/storage/stow_store_test.go index 367145ac93..d888977c57 100644 --- a/flytestdlib/storage/stow_store_test.go +++ b/flytestdlib/storage/stow_store_test.go @@ -274,7 +274,7 @@ func TestStowStore_ReadRaw(t *testing.T) { }, }, nil, false, metrics) assert.NoError(t, err) - dataReference := writeTestFileWithSize(ctx, t, s, "s3://container/path", 3*MiB) + dataReference := writeTestFileWithSize(ctx, t, s, "s3://container/path", 2*MiB+1) _, err = s.ReadRaw(ctx, dataReference) assert.Error(t, err) assert.True(t, IsExceedsLimit(err)) From ded4a2ec303c160aa406c5520ce6615cfe1e347e Mon Sep 17 00:00:00 2001 From: Guy Arad Date: Mon, 6 Nov 2023 22:47:16 +0200 Subject: [PATCH 26/43] minor fix to eks-starter.yaml (#4337) fixes deployment template readiness and liveness probes with correct port Signed-off-by: Guy Arad --- charts/flyte-binary/eks-starter.yaml | 3 ++- charts/flyte-binary/templates/deployment.yaml | 1 + .../flyte_sandbox_binary_helm_generated.yaml | 1 + script/generate_helm.sh | 7 ++++++- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/charts/flyte-binary/eks-starter.yaml b/charts/flyte-binary/eks-starter.yaml index 1ffe5146c6..c7ca135340 100644 --- a/charts/flyte-binary/eks-starter.yaml +++ b/charts/flyte-binary/eks-starter.yaml @@ -1,8 +1,9 @@ configuration: database: + username: password: host: - dbname: app # what is this? why not flyteadmin or flyte? 
+ dbname: flyteadmin () storage: metadataContainer: userDataContainer: diff --git a/charts/flyte-binary/templates/deployment.yaml b/charts/flyte-binary/templates/deployment.yaml index a201a4a300..d1be13d166 100644 --- a/charts/flyte-binary/templates/deployment.yaml +++ b/charts/flyte-binary/templates/deployment.yaml @@ -202,6 +202,7 @@ spec: httpGet: path: /healthcheck port: http + initialDelaySeconds: 30 {{- end }} {{- if .Values.deployment.resources }} resources: {{- toYaml .Values.deployment.resources | nindent 12 }} diff --git a/deployment/sandbox-binary/flyte_sandbox_binary_helm_generated.yaml b/deployment/sandbox-binary/flyte_sandbox_binary_helm_generated.yaml index 1022ed1adc..6be6f29c35 100644 --- a/deployment/sandbox-binary/flyte_sandbox_binary_helm_generated.yaml +++ b/deployment/sandbox-binary/flyte_sandbox_binary_helm_generated.yaml @@ -422,6 +422,7 @@ spec: httpGet: path: /healthcheck port: http + initialDelaySeconds: 30 volumeMounts: - name: cluster-resource-templates mountPath: /etc/flyte/cluster-resource-templates diff --git a/script/generate_helm.sh b/script/generate_helm.sh index 183b22debc..43caee8203 100755 --- a/script/generate_helm.sh +++ b/script/generate_helm.sh @@ -4,9 +4,14 @@ set -ex echo "Generating Helm" -curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash +HELM_SKIP_INSTALL=${HELM_SKIP_INSTALL:-false} + +if [ "${HELM_SKIP_INSTALL}" != "true" ]; then + curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash +fi helm version + # All the values files to be built DEPLOYMENT_CORE=${1:-eks gcp} From 8824341905ed7f306c9eb96a77c19ae4a42792d8 Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Tue, 7 Nov 2023 09:09:01 -0600 Subject: [PATCH 27/43] Reporting running if the primary container status is not yet reported (#4339) * reporting running if the primary container status is not yet reported Signed-off-by: Daniel Rammer * removed dead code Signed-off-by: Daniel Rammer --------- Signed-off-by: 
Daniel Rammer --- .../pluginmachinery/flytek8s/pod_helper.go | 3 +- .../flytek8s/pod_helper_test.go | 1 + .../go/tasks/plugins/k8s/pod/plugin.go | 12 ++++++- .../go/tasks/plugins/k8s/pod/sidecar_test.go | 34 +++++++++++++++++++ 4 files changed, 48 insertions(+), 2 deletions(-) diff --git a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go index 8d33305806..c326a3ddd1 100644 --- a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go +++ b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go @@ -24,6 +24,7 @@ import ( const PodKind = "pod" const OOMKilled = "OOMKilled" const Interrupted = "Interrupted" +const PrimaryContainerNotFound = "PrimaryContainerNotFound" const SIGKILL = 137 const defaultContainerTemplateName = "default" @@ -746,7 +747,7 @@ func DeterminePrimaryContainerPhase(primaryContainerName string, statuses []v1.C } // If for some reason we can't find the primary container, always just return a permanent failure - return pluginsCore.PhaseInfoFailure("PrimaryContainerMissing", + return pluginsCore.PhaseInfoFailure(PrimaryContainerNotFound, fmt.Sprintf("Primary container [%s] not found in pod's container statuses", primaryContainerName), info) } diff --git a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go index 0c3f8d90f2..ec2f8f89da 100644 --- a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go +++ b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go @@ -1682,6 +1682,7 @@ func TestDeterminePrimaryContainerPhase(t *testing.T) { secondaryContainer, }, info) assert.Equal(t, pluginsCore.PhasePermanentFailure, phaseInfo.Phase()) + assert.Equal(t, PrimaryContainerNotFound, phaseInfo.Err().Code) assert.Equal(t, "Primary container [primary] not found in pod's container statuses", phaseInfo.Err().Message) }) } diff --git 
a/flyteplugins/go/tasks/plugins/k8s/pod/plugin.go b/flyteplugins/go/tasks/plugins/k8s/pod/plugin.go index d21eefb8b9..d1ba98bcaa 100644 --- a/flyteplugins/go/tasks/plugins/k8s/pod/plugin.go +++ b/flyteplugins/go/tasks/plugins/k8s/pod/plugin.go @@ -211,7 +211,17 @@ func (plugin) GetTaskPhaseWithLogs(ctx context.Context, pluginContext k8s.Plugin } else { // if the primary container annotation exists, we use the status of the specified container phaseInfo = flytek8s.DeterminePrimaryContainerPhase(primaryContainerName, pod.Status.ContainerStatuses, &info) - if phaseInfo.Phase() == pluginsCore.PhaseRunning && len(info.Logs) > 0 { + if phaseInfo.Phase() == pluginsCore.PhasePermanentFailure && phaseInfo.Err() != nil && + phaseInfo.Err().GetCode() == flytek8s.PrimaryContainerNotFound { + // if the primary container status is not found ensure that the primary container exists. + // note: it should be impossible for the primary container to not exist at this point. + for _, container := range pod.Spec.Containers { + if container.Name == primaryContainerName { + phaseInfo = pluginsCore.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, &info) + break + } + } + } else if phaseInfo.Phase() == pluginsCore.PhaseRunning && len(info.Logs) > 0 { phaseInfo = phaseInfo.WithVersion(pluginsCore.DefaultPhaseVersion + 1) } } diff --git a/flyteplugins/go/tasks/plugins/k8s/pod/sidecar_test.go b/flyteplugins/go/tasks/plugins/k8s/pod/sidecar_test.go index 48a000de17..0c728780d9 100644 --- a/flyteplugins/go/tasks/plugins/k8s/pod/sidecar_test.go +++ b/flyteplugins/go/tasks/plugins/k8s/pod/sidecar_test.go @@ -844,6 +844,13 @@ func TestDemystifiedSidecarStatus_PrimaryRunning(t *testing.T) { func TestDemystifiedSidecarStatus_PrimaryMissing(t *testing.T) { res := &v1.Pod{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "Secondary", + }, + }, + }, Status: v1.PodStatus{ Phase: v1.PodRunning, ContainerStatuses: []v1.ContainerStatus{ @@ -862,6 +869,33 @@ func 
TestDemystifiedSidecarStatus_PrimaryMissing(t *testing.T) { assert.Equal(t, pluginsCore.PhasePermanentFailure, phaseInfo.Phase()) } +func TestDemystifiedSidecarStatus_PrimaryNotExistsYet(t *testing.T) { + res := &v1.Pod{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "Primary", + }, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "Secondary", + }, + }, + }, + } + res.SetAnnotations(map[string]string{ + flytek8s.PrimaryContainerKey: "Primary", + }) + taskCtx := getDummySidecarTaskContext(&core.TaskTemplate{}, sidecarResourceRequirements, nil) + phaseInfo, err := DefaultPodPlugin.GetTaskPhase(context.TODO(), taskCtx, res) + assert.Nil(t, err) + assert.Equal(t, pluginsCore.PhaseRunning, phaseInfo.Phase()) +} + func TestGetProperties(t *testing.T) { expected := k8s.PluginProperties{} assert.Equal(t, expected, DefaultPodPlugin.GetProperties()) From e794063072edb81f9939562cb1f00aeca664d446 Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Tue, 7 Nov 2023 09:09:24 -0600 Subject: [PATCH 28/43] completing retries even if minSuccesses are achieved (#4338) * completing retries even if minSuccesses are achieved Signed-off-by: Daniel Rammer * ensuring all tasks in a terminal state Signed-off-by: Daniel Rammer * updated dead code to comment Signed-off-by: Daniel Rammer --------- Signed-off-by: Daniel Rammer --- .../go/tasks/plugins/array/core/state.go | 5 ++++- .../go/tasks/plugins/array/core/state_test.go | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/flyteplugins/go/tasks/plugins/array/core/state.go b/flyteplugins/go/tasks/plugins/array/core/state.go index 665eec309f..f601ec375b 100644 --- a/flyteplugins/go/tasks/plugins/array/core/state.go +++ b/flyteplugins/go/tasks/plugins/array/core/state.go @@ -273,7 +273,10 @@ func SummaryToPhase(ctx context.Context, minSuccesses int64, summary arraystatus logger.Infof(ctx, "Array is still running and waiting for resources 
totalWaitingForResources[%v]", totalWaitingForResources) return PhaseWaitingForResources } - if totalSuccesses >= minSuccesses && totalRunning == 0 { + + // if we have enough successes, ensure all tasks are in a terminal phase (success or failure) + // before transitioning to the next phase. + if totalSuccesses >= minSuccesses && totalSuccesses+totalPermanentFailures == totalCount { logger.Infof(ctx, "Array succeeded because totalSuccesses[%v] >= minSuccesses[%v]", totalSuccesses, minSuccesses) return PhaseWriteToDiscovery } diff --git a/flyteplugins/go/tasks/plugins/array/core/state_test.go b/flyteplugins/go/tasks/plugins/array/core/state_test.go index 26a80531b5..01b5b41528 100644 --- a/flyteplugins/go/tasks/plugins/array/core/state_test.go +++ b/flyteplugins/go/tasks/plugins/array/core/state_test.go @@ -349,6 +349,24 @@ func TestSummaryToPhase(t *testing.T) { core.PhaseRetryableFailure: 5, }, }, + { + // complete retry even though minSuccesses is achieved + "RetryMinSuccessRatio", + PhaseCheckingSubTaskExecutions, + map[core.Phase]int64{ + core.PhaseSuccess: 10, + core.PhaseRetryableFailure: 1, + }, + }, + { + // ensure all tasks are executed even if minSuccesses is achieved + "ExecuteAllMinSuccessRatio", + PhaseCheckingSubTaskExecutions, + map[core.Phase]int64{ + core.PhaseSuccess: 10, + core.PhaseUndefined: 1, + }, + }, } for _, tt := range tests { From bec7bbb59c6f7891ce6da9793de071636e3dba9a Mon Sep 17 00:00:00 2001 From: Yee Hing Tong Date: Wed, 8 Nov 2023 10:09:03 +0800 Subject: [PATCH 29/43] add a short comment (#4341) Signed-off-by: Yee Hing Tong --- flyteadmin/auth/token.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/flyteadmin/auth/token.go b/flyteadmin/auth/token.go index 9b90a2d108..97029c870a 100644 --- a/flyteadmin/auth/token.go +++ b/flyteadmin/auth/token.go @@ -131,7 +131,12 @@ func IdentityContextFromIDTokenToken(ctx context.Context, tokenStr, clientID str logger.Infof(ctx, "Failed to unmarshal claims from id token, 
err: %v", err) } - // TODO: Document why automatically specify "all" scope + // This path is used when a user logs into the UI and when you login through the UI, you should have all the capabilities your identity + // allows you to have, which is denoted by the "all" scope. + // There was a plan to one day define one of a handful of scopes (all, proj admin, user, viewer) and if you configure your IDP + // to issue the right scopes, admin can do very light weight 'AuthZ' on admin based on these scopes, but until that plan is effected, + // we just use this single scope that Admin expects for all methods + // And because not all IdPs allow us to configure the Identity Token claims, the scope needs to live here. return NewIdentityContext(idToken.Audience[0], idToken.Subject, "", idToken.IssuedAt, sets.NewString(ScopeAll), userInfo, claims) } From d365c116ca0bd03d5bede3600ef6107d81bc4444 Mon Sep 17 00:00:00 2001 From: Eduardo Apolinario <653394+eapolinario@users.noreply.github.com> Date: Tue, 7 Nov 2023 22:48:14 -0800 Subject: [PATCH 30/43] Update tests (#4381) Signed-off-by: Eduardo Apolinario Co-authored-by: Eduardo Apolinario --- .github/workflows/end2end.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/end2end.yml b/.github/workflows/end2end.yml index ad21cf2cf1..cee310b195 100644 --- a/.github/workflows/end2end.yml +++ b/.github/workflows/end2end.yml @@ -87,9 +87,8 @@ jobs: advanced_composition/advanced_composition/dynamics.py \ advanced_composition/advanced_composition/map_task.py \ advanced_composition/advanced_composition/subworkflows.py \ - data_types_and_io/data_types_and_io/custom_objects.py \ - data_types_and_io/data_types_and_io/schema.py \ - data_types_and_io/data_types_and_io/typed_schema.py ; + data_types_and_io/data_types_and_io/dataclass.py \ + data_types_and_io/data_types_and_io/structured_dataset.py ; do pyflyte --config ./boilerplate/flyte/end2end/functional-test-config.yaml \ register \ From 
feeda193f285116a207342d17b173217650f3b0a Mon Sep 17 00:00:00 2001 From: Nikki Everett Date: Wed, 8 Nov 2023 03:20:46 -0600 Subject: [PATCH 31/43] Update order of cluster resources config to work with both uctl and flytectl (#4373) * update order of cluster resources config to work with both uctl and flytectl Signed-off-by: nikki everett * remove extraneous blank line Signed-off-by: nikki everett --------- Signed-off-by: nikki everett --- rsts/deployment/configuration/general.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rsts/deployment/configuration/general.rst b/rsts/deployment/configuration/general.rst index 32e162afe8..c61d096930 100644 --- a/rsts/deployment/configuration/general.rst +++ b/rsts/deployment/configuration/general.rst @@ -105,11 +105,11 @@ Define an attributes file, ``cra.yaml``: .. code-block:: yaml + domain: development + project: flyteexamples attributes: projectQuotaCpu: "1000" projectQuotaMemory: 5Ti - domain: development - project: flyteexamples To ensure that the overrides reflect in the Kubernetes namespace ``flyteexamples-development`` (that is, the namespace has a resource quota of From 6b5994ff1d529416113b790897367a7c847c4650 Mon Sep 17 00:00:00 2001 From: Eduardo Apolinario <653394+eapolinario@users.noreply.github.com> Date: Wed, 8 Nov 2023 05:25:03 -0800 Subject: [PATCH 32/43] Update tests in single-binary (#4383) Signed-off-by: Eduardo Apolinario Co-authored-by: Eduardo Apolinario --- .github/workflows/single-binary.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/single-binary.yml b/.github/workflows/single-binary.yml index c75892a292..630068da00 100644 --- a/.github/workflows/single-binary.yml +++ b/.github/workflows/single-binary.yml @@ -201,9 +201,8 @@ jobs: advanced_composition/advanced_composition/dynamics.py \ advanced_composition/advanced_composition/map_task.py \ advanced_composition/advanced_composition/subworkflows.py \ - 
data_types_and_io/data_types_and_io/custom_objects.py \ - data_types_and_io/data_types_and_io/schema.py \ - data_types_and_io/data_types_and_io/typed_schema.py ; + data_types_and_io/data_types_and_io/dataclass.py \ + data_types_and_io/data_types_and_io/structured_dataset.py ; do pyflyte --config ./boilerplate/flyte/end2end/functional-test-config.yaml \ register \ From bda9acaf6e6b6f20706b6eb277c1c97f1599a48b Mon Sep 17 00:00:00 2001 From: Jeev B Date: Wed, 8 Nov 2023 07:48:48 -0800 Subject: [PATCH 33/43] =?UTF-8?q?Passthrough=20unique=20node=20ID=20in=20t?= =?UTF-8?q?ask=20execution=20ID=20for=20generating=20log=20te=E2=80=A6=20(?= =?UTF-8?q?#4380)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Passthrough unique node ID in task execution ID for generating log template vars Signed-off-by: Jeev B * Fix failing test Signed-off-by: Jeev B * Fix linting issue Signed-off-by: Jeev B --------- Signed-off-by: Jeev B --- flyteplugins/go/tasks/logs/logging_utils.go | 5 +- .../go/tasks/logs/logging_utils_test.go | 59 +++++++----- .../pluginmachinery/core/exec_metadata.go | 4 + .../core/mocks/task_execution_id.go | 32 +++++++ .../flytek8s/k8s_resource_adds_test.go | 4 + .../tasks/pluginmachinery/tasklog/plugin.go | 3 +- .../tasks/pluginmachinery/tasklog/template.go | 96 +++++++++---------- .../pluginmachinery/tasklog/template_test.go | 93 +++++++----------- .../go/tasks/plugins/k8s/dask/dask.go | 10 +- .../k8s/kfoperators/common/common_operator.go | 50 +++++----- .../go/tasks/plugins/k8s/pod/plugin.go | 6 +- flyteplugins/go/tasks/plugins/k8s/ray/ray.go | 6 +- .../go/tasks/plugins/k8s/spark/spark.go | 34 +++---- flyteplugins/tests/end_to_end.go | 1 + .../controller/nodes/task/taskexec_context.go | 23 +++-- .../nodes/task/taskexec_context_test.go | 82 +++++++++++----- .../controller/nodes/task/transformer_test.go | 2 + 17 files changed, 289 insertions(+), 221 deletions(-) diff --git a/flyteplugins/go/tasks/logs/logging_utils.go 
b/flyteplugins/go/tasks/logs/logging_utils.go index 0ca515d7c8..6af1889e9f 100644 --- a/flyteplugins/go/tasks/logs/logging_utils.go +++ b/flyteplugins/go/tasks/logs/logging_utils.go @@ -8,6 +8,7 @@ import ( v1 "k8s.io/api/core/v1" "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" + pluginsCore "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core" "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/tasklog" "github.com/flyteorg/flyte/flytestdlib/logger" ) @@ -18,7 +19,7 @@ type logPlugin struct { } // Internal -func GetLogsForContainerInPod(ctx context.Context, logPlugin tasklog.Plugin, taskExecID *core.TaskExecutionIdentifier, pod *v1.Pod, index uint32, nameSuffix string, extraLogTemplateVarsByScheme *tasklog.TemplateVarsByScheme) ([]*core.TaskLog, error) { +func GetLogsForContainerInPod(ctx context.Context, logPlugin tasklog.Plugin, taskExecID pluginsCore.TaskExecutionID, pod *v1.Pod, index uint32, nameSuffix string, extraLogTemplateVarsByScheme *tasklog.TemplateVarsByScheme) ([]*core.TaskLog, error) { if logPlugin == nil { return nil, nil } @@ -53,7 +54,7 @@ func GetLogsForContainerInPod(ctx context.Context, logPlugin tasklog.Plugin, tas PodRFC3339FinishTime: time.Unix(finishTime, 0).Format(time.RFC3339), PodUnixStartTime: startTime, PodUnixFinishTime: finishTime, - TaskExecutionIdentifier: taskExecID, + TaskExecutionID: taskExecID, ExtraTemplateVarsByScheme: extraLogTemplateVarsByScheme, }, ) diff --git a/flyteplugins/go/tasks/logs/logging_utils_test.go b/flyteplugins/go/tasks/logs/logging_utils_test.go index fbf86b9933..066fdd96c8 100644 --- a/flyteplugins/go/tasks/logs/logging_utils_test.go +++ b/flyteplugins/go/tasks/logs/logging_utils_test.go @@ -10,34 +10,41 @@ import ( v12 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" + pluginCore "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core" + coreMocks 
"github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core/mocks" "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/tasklog" ) const podName = "PodName" -var dummyTaskExecID = &core.TaskExecutionIdentifier{ - TaskId: &core.Identifier{ - ResourceType: core.ResourceType_TASK, - Name: "my-task-name", - Project: "my-task-project", - Domain: "my-task-domain", - Version: "1", - }, - NodeExecutionId: &core.NodeExecutionIdentifier{ - NodeId: "n0", - ExecutionId: &core.WorkflowExecutionIdentifier{ - Name: "my-execution-name", - Project: "my-execution-project", - Domain: "my-execution-domain", +func dummyTaskExecID() pluginCore.TaskExecutionID { + tID := &coreMocks.TaskExecutionID{} + tID.OnGetGeneratedName().Return("generated-name") + tID.OnGetID().Return(core.TaskExecutionIdentifier{ + TaskId: &core.Identifier{ + ResourceType: core.ResourceType_TASK, + Name: "my-task-name", + Project: "my-task-project", + Domain: "my-task-domain", + Version: "1", }, - }, - RetryAttempt: 1, + NodeExecutionId: &core.NodeExecutionIdentifier{ + ExecutionId: &core.WorkflowExecutionIdentifier{ + Name: "my-execution-name", + Project: "my-execution-project", + Domain: "my-execution-domain", + }, + }, + RetryAttempt: 1, + }) + tID.OnGetUniqueNodeID().Return("n0-0-n0") + return tID } func TestGetLogsForContainerInPod_NoPlugins(t *testing.T) { logPlugin, err := InitializeLogPlugins(&LogConfig{}) assert.NoError(t, err) - l, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, nil, 0, " Suffix", nil) + l, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), nil, 0, " Suffix", nil) assert.NoError(t, err) assert.Nil(t, l) } @@ -49,7 +56,7 @@ func TestGetLogsForContainerInPod_NoLogs(t *testing.T) { CloudwatchLogGroup: "/kubernetes/flyte-production", }) assert.NoError(t, err) - p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, nil, 0, " Suffix", nil) + p, err := GetLogsForContainerInPod(context.TODO(), 
logPlugin, dummyTaskExecID(), nil, 0, " Suffix", nil) assert.NoError(t, err) assert.Nil(t, p) } @@ -80,7 +87,7 @@ func TestGetLogsForContainerInPod_BadIndex(t *testing.T) { } pod.Name = podName - p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 1, " Suffix", nil) + p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), pod, 1, " Suffix", nil) assert.NoError(t, err) assert.Nil(t, p) } @@ -105,7 +112,7 @@ func TestGetLogsForContainerInPod_MissingStatus(t *testing.T) { } pod.Name = podName - p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 1, " Suffix", nil) + p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), pod, 1, " Suffix", nil) assert.NoError(t, err) assert.Nil(t, p) } @@ -135,7 +142,7 @@ func TestGetLogsForContainerInPod_Cloudwatch(t *testing.T) { } pod.Name = podName - logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 0, " Suffix", nil) + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), pod, 0, " Suffix", nil) assert.Nil(t, err) assert.Len(t, logs, 1) } @@ -165,7 +172,7 @@ func TestGetLogsForContainerInPod_K8s(t *testing.T) { } pod.Name = podName - logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 0, " Suffix", nil) + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), pod, 0, " Suffix", nil) assert.Nil(t, err) assert.Len(t, logs, 1) } @@ -198,7 +205,7 @@ func TestGetLogsForContainerInPod_All(t *testing.T) { } pod.Name = podName - logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 0, " Suffix", nil) + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), pod, 0, " Suffix", nil) assert.Nil(t, err) assert.Len(t, logs, 2) } @@ -229,7 +236,7 @@ func TestGetLogsForContainerInPod_Stackdriver(t *testing.T) { } pod.Name = podName - logs, err 
:= GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 0, " Suffix", nil) + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), pod, 0, " Suffix", nil) assert.Nil(t, err) assert.Len(t, logs, 1) } @@ -303,7 +310,7 @@ func assertTestSucceeded(tb testing.TB, config *LogConfig, expectedTaskLogs []*c }, } - logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 0, " my-Suffix", nil) + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), pod, 0, " my-Suffix", nil) assert.Nil(tb, err) assert.Len(tb, logs, len(expectedTaskLogs)) if diff := deep.Equal(logs, expectedTaskLogs); len(diff) > 0 { @@ -337,7 +344,7 @@ func TestGetLogsForContainerInPod_Templates(t *testing.T) { Name: "StackDriver my-Suffix", }, { - Uri: "https://flyte.corp.net/console/projects/my-execution-project/domains/my-execution-domain/executions/my-execution-name/nodeId/n0/taskId/my-task-name/attempt/1/view/logs", + Uri: "https://flyte.corp.net/console/projects/my-execution-project/domains/my-execution-domain/executions/my-execution-name/nodeId/n0-0-n0/taskId/my-task-name/attempt/1/view/logs", MessageFormat: core.TaskLog_JSON, Name: "Internal my-Suffix", }, diff --git a/flyteplugins/go/tasks/pluginmachinery/core/exec_metadata.go b/flyteplugins/go/tasks/pluginmachinery/core/exec_metadata.go index 8517b9c385..9ac650baaa 100644 --- a/flyteplugins/go/tasks/pluginmachinery/core/exec_metadata.go +++ b/flyteplugins/go/tasks/pluginmachinery/core/exec_metadata.go @@ -27,6 +27,10 @@ type TaskExecutionID interface { // GetID returns the underlying idl task identifier. GetID() core.TaskExecutionIdentifier + + // GetUniqueNodeID returns the fully-qualified Node ID that is unique within a + // given workflow execution. + GetUniqueNodeID() string } // TaskExecutionMetadata represents any execution information for a Task. 
It is used to communicate meta information about the diff --git a/flyteplugins/go/tasks/pluginmachinery/core/mocks/task_execution_id.go b/flyteplugins/go/tasks/pluginmachinery/core/mocks/task_execution_id.go index 7db5590170..44596bf82f 100644 --- a/flyteplugins/go/tasks/pluginmachinery/core/mocks/task_execution_id.go +++ b/flyteplugins/go/tasks/pluginmachinery/core/mocks/task_execution_id.go @@ -114,3 +114,35 @@ func (_m *TaskExecutionID) GetID() flyteidlcore.TaskExecutionIdentifier { return r0 } + +type TaskExecutionID_GetUniqueNodeID struct { + *mock.Call +} + +func (_m TaskExecutionID_GetUniqueNodeID) Return(_a0 string) *TaskExecutionID_GetUniqueNodeID { + return &TaskExecutionID_GetUniqueNodeID{Call: _m.Call.Return(_a0)} +} + +func (_m *TaskExecutionID) OnGetUniqueNodeID() *TaskExecutionID_GetUniqueNodeID { + c_call := _m.On("GetUniqueNodeID") + return &TaskExecutionID_GetUniqueNodeID{Call: c_call} +} + +func (_m *TaskExecutionID) OnGetUniqueNodeIDMatch(matchers ...interface{}) *TaskExecutionID_GetUniqueNodeID { + c_call := _m.On("GetUniqueNodeID", matchers...) 
+ return &TaskExecutionID_GetUniqueNodeID{Call: c_call} +} + +// GetUniqueNodeID provides a mock function with given fields: +func (_m *TaskExecutionID) GetUniqueNodeID() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} diff --git a/flyteplugins/go/tasks/pluginmachinery/flytek8s/k8s_resource_adds_test.go b/flyteplugins/go/tasks/pluginmachinery/flytek8s/k8s_resource_adds_test.go index 5703b88d81..60a3833397 100644 --- a/flyteplugins/go/tasks/pluginmachinery/flytek8s/k8s_resource_adds_test.go +++ b/flyteplugins/go/tasks/pluginmachinery/flytek8s/k8s_resource_adds_test.go @@ -227,6 +227,10 @@ func (m mockTaskExecutionIdentifier) GetGeneratedName() string { return "task-exec-name" } +func (m mockTaskExecutionIdentifier) GetUniqueNodeID() string { + return "unique-node-id" +} + func TestDecorateEnvVars(t *testing.T) { ctx := context.Background() ctx = contextutils.WithWorkflowID(ctx, "fake_workflow") diff --git a/flyteplugins/go/tasks/pluginmachinery/tasklog/plugin.go b/flyteplugins/go/tasks/pluginmachinery/tasklog/plugin.go index 0ca91c3370..b812221f6d 100644 --- a/flyteplugins/go/tasks/pluginmachinery/tasklog/plugin.go +++ b/flyteplugins/go/tasks/pluginmachinery/tasklog/plugin.go @@ -4,6 +4,7 @@ import ( "regexp" "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" + pluginsCore "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core" ) //go:generate enumer --type=TemplateScheme --trimprefix=TemplateScheme -json -yaml @@ -42,7 +43,7 @@ type Input struct { PodUnixStartTime int64 PodUnixFinishTime int64 PodUID string - TaskExecutionIdentifier *core.TaskExecutionIdentifier + TaskExecutionID pluginsCore.TaskExecutionID ExtraTemplateVarsByScheme *TemplateVarsByScheme } diff --git a/flyteplugins/go/tasks/pluginmachinery/tasklog/template.go b/flyteplugins/go/tasks/pluginmachinery/tasklog/template.go index 2a68f42cff..77c49d2695 100644 --- 
a/flyteplugins/go/tasks/pluginmachinery/tasklog/template.go +++ b/flyteplugins/go/tasks/pluginmachinery/tasklog/template.go @@ -114,55 +114,55 @@ func (input Input) templateVarsForScheme(scheme TemplateScheme) TemplateVars { vars = append(vars, input.ExtraTemplateVarsByScheme.Pod...) } case TemplateSchemeTaskExecution: - if input.TaskExecutionIdentifier != nil { - vars = append(vars, TemplateVar{ + taskExecutionIdentifier := input.TaskExecutionID.GetID() + vars = append( + vars, + TemplateVar{ + defaultRegexes.NodeID, + input.TaskExecutionID.GetUniqueNodeID(), + }, + TemplateVar{ defaultRegexes.TaskRetryAttempt, - strconv.FormatUint(uint64(input.TaskExecutionIdentifier.RetryAttempt), 10), - }) - if input.TaskExecutionIdentifier.TaskId != nil { - vars = append( - vars, - TemplateVar{ - defaultRegexes.TaskID, - input.TaskExecutionIdentifier.TaskId.Name, - }, - TemplateVar{ - defaultRegexes.TaskVersion, - input.TaskExecutionIdentifier.TaskId.Version, - }, - TemplateVar{ - defaultRegexes.TaskProject, - input.TaskExecutionIdentifier.TaskId.Project, - }, - TemplateVar{ - defaultRegexes.TaskDomain, - input.TaskExecutionIdentifier.TaskId.Domain, - }, - ) - } - if input.TaskExecutionIdentifier.NodeExecutionId != nil { - vars = append(vars, TemplateVar{ - defaultRegexes.NodeID, - input.TaskExecutionIdentifier.NodeExecutionId.NodeId, - }) - if input.TaskExecutionIdentifier.NodeExecutionId.ExecutionId != nil { - vars = append( - vars, - TemplateVar{ - defaultRegexes.ExecutionName, - input.TaskExecutionIdentifier.NodeExecutionId.ExecutionId.Name, - }, - TemplateVar{ - defaultRegexes.ExecutionProject, - input.TaskExecutionIdentifier.NodeExecutionId.ExecutionId.Project, - }, - TemplateVar{ - defaultRegexes.ExecutionDomain, - input.TaskExecutionIdentifier.NodeExecutionId.ExecutionId.Domain, - }, - ) - } - } + strconv.FormatUint(uint64(taskExecutionIdentifier.RetryAttempt), 10), + }, + ) + if taskExecutionIdentifier.TaskId != nil { + vars = append( + vars, + TemplateVar{ + 
defaultRegexes.TaskID, + taskExecutionIdentifier.TaskId.Name, + }, + TemplateVar{ + defaultRegexes.TaskVersion, + taskExecutionIdentifier.TaskId.Version, + }, + TemplateVar{ + defaultRegexes.TaskProject, + taskExecutionIdentifier.TaskId.Project, + }, + TemplateVar{ + defaultRegexes.TaskDomain, + taskExecutionIdentifier.TaskId.Domain, + }, + ) + } + if taskExecutionIdentifier.NodeExecutionId != nil && taskExecutionIdentifier.NodeExecutionId.ExecutionId != nil { + vars = append( + vars, + TemplateVar{ + defaultRegexes.ExecutionName, + taskExecutionIdentifier.NodeExecutionId.ExecutionId.Name, + }, + TemplateVar{ + defaultRegexes.ExecutionProject, + taskExecutionIdentifier.NodeExecutionId.ExecutionId.Project, + }, + TemplateVar{ + defaultRegexes.ExecutionDomain, + taskExecutionIdentifier.NodeExecutionId.ExecutionId.Domain, + }, + ) } if gotExtraTemplateVars { vars = append(vars, input.ExtraTemplateVarsByScheme.TaskExecution...) diff --git a/flyteplugins/go/tasks/pluginmachinery/tasklog/template_test.go b/flyteplugins/go/tasks/pluginmachinery/tasklog/template_test.go index e3f03047aa..320ece05a4 100644 --- a/flyteplugins/go/tasks/pluginmachinery/tasklog/template_test.go +++ b/flyteplugins/go/tasks/pluginmachinery/tasklog/template_test.go @@ -9,6 +9,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" + pluginCore "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core" + coreMocks "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core/mocks" ) func TestTemplateLog(t *testing.T) { @@ -38,6 +40,30 @@ func Benchmark_initDefaultRegexes(b *testing.B) { } } +func dummyTaskExecID() pluginCore.TaskExecutionID { + tID := &coreMocks.TaskExecutionID{} + tID.OnGetGeneratedName().Return("generated-name") + tID.OnGetID().Return(core.TaskExecutionIdentifier{ + TaskId: &core.Identifier{ + ResourceType: core.ResourceType_TASK, + Name: "my-task-name", + Project: "my-task-project", + Domain: 
"my-task-domain", + Version: "1", + }, + NodeExecutionId: &core.NodeExecutionIdentifier{ + ExecutionId: &core.WorkflowExecutionIdentifier{ + Name: "my-execution-name", + Project: "my-execution-project", + Domain: "my-execution-domain", + }, + }, + RetryAttempt: 1, + }) + tID.OnGetUniqueNodeID().Return("n0-0-n0") + return tID +} + func Test_Input_templateVarsForScheme(t *testing.T) { testRegexes := struct { Foo *regexp.Regexp @@ -66,25 +92,8 @@ func Test_Input_templateVarsForScheme(t *testing.T) { PodUnixFinishTime: 12345, } taskExecutionBase := Input{ - LogName: "main_logs", - TaskExecutionIdentifier: &core.TaskExecutionIdentifier{ - TaskId: &core.Identifier{ - ResourceType: core.ResourceType_TASK, - Name: "my-task-name", - Project: "my-task-project", - Domain: "my-task-domain", - Version: "1", - }, - NodeExecutionId: &core.NodeExecutionIdentifier{ - NodeId: "n0", - ExecutionId: &core.WorkflowExecutionIdentifier{ - Name: "my-execution-name", - Project: "my-execution-project", - Domain: "my-execution-domain", - }, - }, - RetryAttempt: 0, - }, + LogName: "main_logs", + TaskExecutionID: dummyTaskExecID(), } tests := []struct { @@ -162,12 +171,12 @@ func Test_Input_templateVarsForScheme(t *testing.T) { nil, TemplateVars{ {defaultRegexes.LogName, "main_logs"}, - {defaultRegexes.TaskRetryAttempt, "0"}, + {defaultRegexes.NodeID, "n0-0-n0"}, + {defaultRegexes.TaskRetryAttempt, "1"}, {defaultRegexes.TaskID, "my-task-name"}, {defaultRegexes.TaskVersion, "1"}, {defaultRegexes.TaskProject, "my-task-project"}, {defaultRegexes.TaskDomain, "my-task-domain"}, - {defaultRegexes.NodeID, "n0"}, {defaultRegexes.ExecutionName, "my-execution-name"}, {defaultRegexes.ExecutionProject, "my-execution-project"}, {defaultRegexes.ExecutionDomain, "my-execution-domain"}, @@ -484,30 +493,13 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { PodRFC3339FinishTime: "1970-01-01T04:25:45+01:00", PodUnixStartTime: 123, PodUnixFinishTime: 12345, - TaskExecutionIdentifier: 
&core.TaskExecutionIdentifier{ - TaskId: &core.Identifier{ - ResourceType: core.ResourceType_TASK, - Name: "my-task-name", - Project: "my-task-project", - Domain: "my-task-domain", - Version: "1", - }, - NodeExecutionId: &core.NodeExecutionIdentifier{ - NodeId: "n0", - ExecutionId: &core.WorkflowExecutionIdentifier{ - Name: "my-execution-name", - Project: "my-execution-project", - Domain: "my-execution-domain", - }, - }, - RetryAttempt: 0, - }, + TaskExecutionID: dummyTaskExecID(), }, }, Output{ TaskLogs: []*core.TaskLog{ { - Uri: "https://flyte.corp.net/console/projects/my-execution-project/domains/my-execution-domain/executions/my-execution-name/nodeId/n0/taskId/my-task-name/attempt/0/view/logs", + Uri: "https://flyte.corp.net/console/projects/my-execution-project/domains/my-execution-domain/executions/my-execution-name/nodeId/n0-0-n0/taskId/my-task-name/attempt/1/view/logs", MessageFormat: core.TaskLog_JSON, Name: "main_logs", }, @@ -534,24 +526,7 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { PodRFC3339FinishTime: "1970-01-01T04:25:45+01:00", PodUnixStartTime: 123, PodUnixFinishTime: 12345, - TaskExecutionIdentifier: &core.TaskExecutionIdentifier{ - TaskId: &core.Identifier{ - ResourceType: core.ResourceType_TASK, - Name: "my-task-name", - Project: "my-task-project", - Domain: "my-task-domain", - Version: "1", - }, - NodeExecutionId: &core.NodeExecutionIdentifier{ - NodeId: "n0", - ExecutionId: &core.WorkflowExecutionIdentifier{ - Name: "my-execution-name", - Project: "my-execution-project", - Domain: "my-execution-domain", - }, - }, - RetryAttempt: 0, - }, + TaskExecutionID: dummyTaskExecID(), ExtraTemplateVarsByScheme: &TemplateVarsByScheme{ TaskExecution: TemplateVars{ {MustCreateRegex("subtaskExecutionIndex"), "1"}, @@ -564,7 +539,7 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { Output{ TaskLogs: []*core.TaskLog{ { - Uri: 
"https://flyte.corp.net/console/projects/my-execution-project/domains/my-execution-domain/executions/my-execution-name/nodeId/n0/taskId/my-task-name/attempt/0/mappedIndex/1/mappedAttempt/1/view/logs", + Uri: "https://flyte.corp.net/console/projects/my-execution-project/domains/my-execution-domain/executions/my-execution-name/nodeId/n0-0-n0/taskId/my-task-name/attempt/0/mappedIndex/1/mappedAttempt/1/view/logs", MessageFormat: core.TaskLog_JSON, Name: "main_logs", }, diff --git a/flyteplugins/go/tasks/plugins/k8s/dask/dask.go b/flyteplugins/go/tasks/plugins/k8s/dask/dask.go index 65050f5bb2..eb27aec3ce 100644 --- a/flyteplugins/go/tasks/plugins/k8s/dask/dask.go +++ b/flyteplugins/go/tasks/plugins/k8s/dask/dask.go @@ -298,13 +298,13 @@ func (p daskResourceHandler) GetTaskPhase(ctx context.Context, pluginContext k8s status == daskAPI.DaskJobClusterCreated if !isQueued { - taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID().GetID() + taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID() o, err := logPlugin.GetTaskLogs( tasklog.Input{ - Namespace: job.ObjectMeta.Namespace, - PodName: job.Status.JobRunnerPodName, - LogName: "(User logs)", - TaskExecutionIdentifier: &taskExecID, + Namespace: job.ObjectMeta.Namespace, + PodName: job.Status.JobRunnerPodName, + LogName: "(User logs)", + TaskExecutionID: taskExecID, }, ) if err != nil { diff --git a/flyteplugins/go/tasks/plugins/k8s/kfoperators/common/common_operator.go b/flyteplugins/go/tasks/plugins/k8s/kfoperators/common/common_operator.go index e0903d02a3..594767b4b4 100644 --- a/flyteplugins/go/tasks/plugins/k8s/kfoperators/common/common_operator.go +++ b/flyteplugins/go/tasks/plugins/k8s/kfoperators/common/common_operator.go @@ -98,7 +98,7 @@ func GetLogs(pluginContext k8s.PluginContext, taskType string, objectMeta meta_v namespace := objectMeta.Namespace taskLogs := make([]*core.TaskLog, 0, 10) - taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID().GetID() + 
taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID() logPlugin, err := logs.InitializeLogPlugins(logs.GetLogConfig()) @@ -120,14 +120,14 @@ func GetLogs(pluginContext k8s.PluginContext, taskType string, objectMeta meta_v if taskType == PytorchTaskType && hasMaster { masterTaskLog, masterErr := logPlugin.GetTaskLogs( tasklog.Input{ - PodName: name + "-master-0", - Namespace: namespace, - LogName: "master", - PodRFC3339StartTime: RFC3999StartTime, - PodRFC3339FinishTime: RFC3999FinishTime, - PodUnixStartTime: startTime, - PodUnixFinishTime: finishTime, - TaskExecutionIdentifier: &taskExecID, + PodName: name + "-master-0", + Namespace: namespace, + LogName: "master", + PodRFC3339StartTime: RFC3999StartTime, + PodRFC3339FinishTime: RFC3999FinishTime, + PodUnixStartTime: startTime, + PodUnixFinishTime: finishTime, + TaskExecutionID: taskExecID, }, ) if masterErr != nil { @@ -139,13 +139,13 @@ func GetLogs(pluginContext k8s.PluginContext, taskType string, objectMeta meta_v // get all workers log for workerIndex := int32(0); workerIndex < workersCount; workerIndex++ { workerLog, err := logPlugin.GetTaskLogs(tasklog.Input{ - PodName: name + fmt.Sprintf("-worker-%d", workerIndex), - Namespace: namespace, - PodRFC3339StartTime: RFC3999StartTime, - PodRFC3339FinishTime: RFC3999FinishTime, - PodUnixStartTime: startTime, - PodUnixFinishTime: finishTime, - TaskExecutionIdentifier: &taskExecID, + PodName: name + fmt.Sprintf("-worker-%d", workerIndex), + Namespace: namespace, + PodRFC3339StartTime: RFC3999StartTime, + PodRFC3339FinishTime: RFC3999FinishTime, + PodUnixStartTime: startTime, + PodUnixFinishTime: finishTime, + TaskExecutionID: taskExecID, }) if err != nil { return nil, err @@ -160,9 +160,9 @@ func GetLogs(pluginContext k8s.PluginContext, taskType string, objectMeta meta_v // get all parameter servers logs for psReplicaIndex := int32(0); psReplicaIndex < psReplicasCount; psReplicaIndex++ { psReplicaLog, err := logPlugin.GetTaskLogs(tasklog.Input{ - 
PodName: name + fmt.Sprintf("-psReplica-%d", psReplicaIndex), - Namespace: namespace, - TaskExecutionIdentifier: &taskExecID, + PodName: name + fmt.Sprintf("-psReplica-%d", psReplicaIndex), + Namespace: namespace, + TaskExecutionID: taskExecID, }) if err != nil { return nil, err @@ -172,9 +172,9 @@ func GetLogs(pluginContext k8s.PluginContext, taskType string, objectMeta meta_v // get chief worker log, and the max number of chief worker is 1 if chiefReplicasCount != 0 { chiefReplicaLog, err := logPlugin.GetTaskLogs(tasklog.Input{ - PodName: name + fmt.Sprintf("-chiefReplica-%d", 0), - Namespace: namespace, - TaskExecutionIdentifier: &taskExecID, + PodName: name + fmt.Sprintf("-chiefReplica-%d", 0), + Namespace: namespace, + TaskExecutionID: taskExecID, }) if err != nil { return nil, err @@ -184,9 +184,9 @@ func GetLogs(pluginContext k8s.PluginContext, taskType string, objectMeta meta_v // get evaluator log, and the max number of evaluator is 1 if evaluatorReplicasCount != 0 { evaluatorReplicasCount, err := logPlugin.GetTaskLogs(tasklog.Input{ - PodName: name + fmt.Sprintf("-evaluatorReplica-%d", 0), - Namespace: namespace, - TaskExecutionIdentifier: &taskExecID, + PodName: name + fmt.Sprintf("-evaluatorReplica-%d", 0), + Namespace: namespace, + TaskExecutionID: taskExecID, }) if err != nil { return nil, err diff --git a/flyteplugins/go/tasks/plugins/k8s/pod/plugin.go b/flyteplugins/go/tasks/plugins/k8s/pod/plugin.go index d1ba98bcaa..11de877021 100644 --- a/flyteplugins/go/tasks/plugins/k8s/pod/plugin.go +++ b/flyteplugins/go/tasks/plugins/k8s/pod/plugin.go @@ -164,9 +164,9 @@ func (plugin) GetTaskPhaseWithLogs(ctx context.Context, pluginContext k8s.Plugin ReportedAt: &reportedAt, } - taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID().GetID() + taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID() if pod.Status.Phase != v1.PodPending && pod.Status.Phase != v1.PodUnknown { - taskLogs, err := logs.GetLogsForContainerInPod(ctx, 
logPlugin, &taskExecID, pod, 0, logSuffix, extraLogTemplateVarsByScheme) + taskLogs, err := logs.GetLogsForContainerInPod(ctx, logPlugin, taskExecID, pod, 0, logSuffix, extraLogTemplateVarsByScheme) if err != nil { return pluginsCore.PhaseInfoUndefined, err } @@ -211,7 +211,7 @@ func (plugin) GetTaskPhaseWithLogs(ctx context.Context, pluginContext k8s.Plugin } else { // if the primary container annotation exists, we use the status of the specified container phaseInfo = flytek8s.DeterminePrimaryContainerPhase(primaryContainerName, pod.Status.ContainerStatuses, &info) - if phaseInfo.Phase() == pluginsCore.PhasePermanentFailure && phaseInfo.Err() != nil && + if phaseInfo.Phase() == pluginsCore.PhasePermanentFailure && phaseInfo.Err() != nil && phaseInfo.Err().GetCode() == flytek8s.PrimaryContainerNotFound { // if the primary container status is not found ensure that the primary container exists. // note: it should be impossible for the primary container to not exist at this point. diff --git a/flyteplugins/go/tasks/plugins/k8s/ray/ray.go b/flyteplugins/go/tasks/plugins/k8s/ray/ray.go index c1dcc2b8e2..cc8d198334 100644 --- a/flyteplugins/go/tasks/plugins/k8s/ray/ray.go +++ b/flyteplugins/go/tasks/plugins/k8s/ray/ray.go @@ -444,10 +444,10 @@ func getEventInfoForRayJob(logConfig logs.LogConfig, pluginContext k8s.PluginCon // TODO: Retrieve the name of head pod from rayJob.status, and add it to task logs // RayJob CRD does not include the name of the worker or head pod for now - taskID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID().GetID() + taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID() logOutput, err := logPlugin.GetTaskLogs(tasklog.Input{ - Namespace: rayJob.Namespace, - TaskExecutionIdentifier: &taskID, + Namespace: rayJob.Namespace, + TaskExecutionID: taskExecID, }) if err != nil { diff --git a/flyteplugins/go/tasks/plugins/k8s/spark/spark.go b/flyteplugins/go/tasks/plugins/k8s/spark/spark.go index d0506ccfb5..e5fd14478a 
100644 --- a/flyteplugins/go/tasks/plugins/k8s/spark/spark.go +++ b/flyteplugins/go/tasks/plugins/k8s/spark/spark.go @@ -329,7 +329,7 @@ func getEventInfoForSpark(pluginContext k8s.PluginContext, sj *sparkOp.SparkAppl sparkConfig := GetSparkConfig() taskLogs := make([]*core.TaskLog, 0, 3) - taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID().GetID() + taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID() if !isQueued { if sj.Status.DriverInfo.PodName != "" { @@ -340,10 +340,10 @@ func getEventInfoForSpark(pluginContext k8s.PluginContext, sj *sparkOp.SparkAppl if p != nil { o, err := p.GetTaskLogs(tasklog.Input{ - PodName: sj.Status.DriverInfo.PodName, - Namespace: sj.Namespace, - LogName: "(Driver Logs)", - TaskExecutionIdentifier: &taskExecID, + PodName: sj.Status.DriverInfo.PodName, + Namespace: sj.Namespace, + LogName: "(Driver Logs)", + TaskExecutionID: taskExecID, }) if err != nil { @@ -361,10 +361,10 @@ func getEventInfoForSpark(pluginContext k8s.PluginContext, sj *sparkOp.SparkAppl if p != nil { o, err := p.GetTaskLogs(tasklog.Input{ - PodName: sj.Status.DriverInfo.PodName, - Namespace: sj.Namespace, - LogName: "(User Logs)", - TaskExecutionIdentifier: &taskExecID, + PodName: sj.Status.DriverInfo.PodName, + Namespace: sj.Namespace, + LogName: "(User Logs)", + TaskExecutionID: taskExecID, }) if err != nil { @@ -381,10 +381,10 @@ func getEventInfoForSpark(pluginContext k8s.PluginContext, sj *sparkOp.SparkAppl if p != nil { o, err := p.GetTaskLogs(tasklog.Input{ - PodName: sj.Name, - Namespace: sj.Namespace, - LogName: "(System Logs)", - TaskExecutionIdentifier: &taskExecID, + PodName: sj.Name, + Namespace: sj.Namespace, + LogName: "(System Logs)", + TaskExecutionID: taskExecID, }) if err != nil { @@ -402,10 +402,10 @@ func getEventInfoForSpark(pluginContext k8s.PluginContext, sj *sparkOp.SparkAppl if p != nil { o, err := p.GetTaskLogs(tasklog.Input{ - PodName: sj.Name, - Namespace: sj.Namespace, - LogName: 
"(Spark-Submit/All User Logs)", - TaskExecutionIdentifier: &taskExecID, + PodName: sj.Name, + Namespace: sj.Namespace, + LogName: "(Spark-Submit/All User Logs)", + TaskExecutionID: taskExecID, }) if err != nil { diff --git a/flyteplugins/tests/end_to_end.go b/flyteplugins/tests/end_to_end.go index 603b4d3a30..037ae877d9 100644 --- a/flyteplugins/tests/end_to_end.go +++ b/flyteplugins/tests/end_to_end.go @@ -136,6 +136,7 @@ func RunPluginEndToEndTest(t *testing.T, executor pluginCore.Plugin, template *i }, RetryAttempt: 0, }) + tID.OnGetUniqueNodeID().Return("unique-node-id") overrides := &coreMocks.TaskOverrides{} overrides.OnGetConfig().Return(&v1.ConfigMap{Data: map[string]string{ diff --git a/flytepropeller/pkg/controller/nodes/task/taskexec_context.go b/flytepropeller/pkg/controller/nodes/task/taskexec_context.go index 8b819c79eb..4a6f9750a2 100644 --- a/flytepropeller/pkg/controller/nodes/task/taskexec_context.go +++ b/flytepropeller/pkg/controller/nodes/task/taskexec_context.go @@ -33,14 +33,19 @@ var ( const IDMaxLength = 50 type taskExecutionID struct { - execName string - id *core.TaskExecutionIdentifier + execName string + id *core.TaskExecutionIdentifier + uniqueNodeID string } func (te taskExecutionID) GetID() core.TaskExecutionIdentifier { return *te.id } +func (te taskExecutionID) GetUniqueNodeID() string { + return te.uniqueNodeID +} + func (te taskExecutionID) GetGeneratedName() string { return te.execName } @@ -291,11 +296,15 @@ func (t *Handler) newTaskExecutionContext(ctx context.Context, nCtx interfaces.N NodeExecutionContext: nCtx, tm: taskExecutionMetadata{ NodeExecutionMetadata: nCtx.NodeExecutionMetadata(), - taskExecID: taskExecutionID{execName: uniqueID, id: id}, - o: nCtx.Node(), - maxAttempts: maxAttempts, - platformResources: convertTaskResourcesToRequirements(nCtx.ExecutionContext().GetExecutionConfig().TaskResources), - environmentVariables: nCtx.ExecutionContext().GetExecutionConfig().EnvironmentVariables, + taskExecID: 
taskExecutionID{ + execName: uniqueID, + id: id, + uniqueNodeID: currentNodeUniqueID, + }, + o: nCtx.Node(), + maxAttempts: maxAttempts, + platformResources: convertTaskResourcesToRequirements(nCtx.ExecutionContext().GetExecutionConfig().TaskResources), + environmentVariables: nCtx.ExecutionContext().GetExecutionConfig().EnvironmentVariables, }, rm: resourcemanager.GetTaskResourceManager( t.resourceManager, resourceNamespacePrefix, id), diff --git a/flytepropeller/pkg/controller/nodes/task/taskexec_context_test.go b/flytepropeller/pkg/controller/nodes/task/taskexec_context_test.go index a8cbc7d6d3..b0106e8ff9 100644 --- a/flytepropeller/pkg/controller/nodes/task/taskexec_context_test.go +++ b/flytepropeller/pkg/controller/nodes/task/taskexec_context_test.go @@ -20,8 +20,10 @@ import ( "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/ioutils" "github.com/flyteorg/flyte/flytepropeller/pkg/apis/flyteworkflow/v1alpha1" flyteMocks "github.com/flyteorg/flyte/flytepropeller/pkg/apis/flyteworkflow/v1alpha1/mocks" + "github.com/flyteorg/flyte/flytepropeller/pkg/controller/executors" mocks2 "github.com/flyteorg/flyte/flytepropeller/pkg/controller/executors/mocks" "github.com/flyteorg/flyte/flytepropeller/pkg/controller/nodes/handler" + "github.com/flyteorg/flyte/flytepropeller/pkg/controller/nodes/interfaces" nodeMocks "github.com/flyteorg/flyte/flytepropeller/pkg/controller/nodes/interfaces/mocks" "github.com/flyteorg/flyte/flytepropeller/pkg/controller/nodes/task/codex" "github.com/flyteorg/flyte/flytepropeller/pkg/controller/nodes/task/resourcemanager" @@ -31,15 +33,26 @@ import ( "github.com/flyteorg/flyte/flytestdlib/storage" ) -func TestHandler_newTaskExecutionContext(t *testing.T) { - wfExecID := &core.WorkflowExecutionIdentifier{ +type dummyPluginState struct { + A int +} + +var ( + wfExecID = &core.WorkflowExecutionIdentifier{ Project: "project", Domain: "domain", Name: "name", } + taskID = &core.Identifier{} + nodeID = "n1" + resources = 
&corev1.ResourceRequirements{ + Requests: make(corev1.ResourceList), + Limits: make(corev1.ResourceList), + } + dummyPluginStateA = 45 +) - nodeID := "n1" - +func dummyNodeExecutionContext(t *testing.T, parentInfo executors.ImmutableParentInfo, eventVersion v1alpha1.EventVersion) interfaces.NodeExecutionContext { nm := &nodeMocks.NodeExecutionMetadata{} nm.OnGetAnnotations().Return(map[string]string{}) nm.OnGetNodeExecutionID().Return(&core.NodeExecutionIdentifier{ @@ -55,7 +68,6 @@ func TestHandler_newTaskExecutionContext(t *testing.T) { Name: "name", }) - taskID := &core.Identifier{} tr := &nodeMocks.TaskReader{} tr.OnGetTaskID().Return(taskID) @@ -63,12 +75,8 @@ func TestHandler_newTaskExecutionContext(t *testing.T) { ns.OnGetDataDir().Return("data-dir") ns.OnGetOutputDir().Return("output-dir") - res := &corev1.ResourceRequirements{ - Requests: make(corev1.ResourceList), - Limits: make(corev1.ResourceList), - } n := &flyteMocks.ExecutableNode{} - n.OnGetResources().Return(res) + n.OnGetResources().Return(resources) ma := 5 n.OnGetRetryStrategy().Return(&v1alpha1.RetryStrategy{MinAttempts: &ma}) @@ -87,8 +95,8 @@ func TestHandler_newTaskExecutionContext(t *testing.T) { executionContext := &mocks2.ExecutionContext{} executionContext.OnGetExecutionConfig().Return(v1alpha1.ExecutionConfig{}) - executionContext.OnGetParentInfo().Return(nil) - executionContext.OnGetEventVersion().Return(v1alpha1.EventVersion0) + executionContext.OnGetParentInfo().Return(parentInfo) + executionContext.OnGetEventVersion().Return(eventVersion) nCtx.OnExecutionContext().Return(executionContext) ds, err := storage.NewDataStore( @@ -101,12 +109,8 @@ func TestHandler_newTaskExecutionContext(t *testing.T) { nCtx.OnDataStore().Return(ds) st := bytes.NewBuffer([]byte{}) - a := 45 - type test struct { - A int - } codex := codex.GobStateCodec{} - assert.NoError(t, codex.Encode(test{A: a}, st)) + assert.NoError(t, codex.Encode(dummyPluginState{A: dummyPluginStateA}, st)) nr := 
&nodeMocks.NodeStateReader{} nr.OnGetTaskNodeState().Return(handler.TaskNodeState{ PluginState: st.Bytes(), @@ -114,28 +118,39 @@ func TestHandler_newTaskExecutionContext(t *testing.T) { nCtx.OnNodeStateReader().Return(nr) nCtx.OnRawOutputPrefix().Return("s3://sandbox/") nCtx.OnOutputShardSelector().Return(ioutils.NewConstantShardSelector([]string{"x"})) + return nCtx +} - noopRm := CreateNoopResourceManager(context.TODO(), promutils.NewTestScope()) +func dummyPlugin() pluginCore.Plugin { + p := &pluginCoreMocks.Plugin{} + p.On("GetID").Return("plugin1") + p.OnGetProperties().Return(pluginCore.PluginProperties{}) + return p +} +func dummyHandler() *Handler { + noopRm := CreateNoopResourceManager(context.TODO(), promutils.NewTestScope()) c := &mocks.Client{} - tk := &Handler{ + return &Handler{ catalog: c, secretManager: secretmanager.NewFileEnvSecretManager(secretmanager.GetConfig()), resourceManager: noopRm, } +} - p := &pluginCoreMocks.Plugin{} - p.On("GetID").Return("plugin1") - p.OnGetProperties().Return(pluginCore.PluginProperties{}) +func TestHandler_newTaskExecutionContext(t *testing.T) { + nCtx := dummyNodeExecutionContext(t, nil, v1alpha1.EventVersion0) + p := dummyPlugin() + tk := dummyHandler() got, err := tk.newTaskExecutionContext(context.TODO(), nCtx, p) assert.NoError(t, err) assert.NotNil(t, got) - f := &test{} + f := &dummyPluginState{} v, err := got.PluginStateReader().Get(f) assert.NoError(t, err) assert.Equal(t, v, uint8(0)) - assert.Equal(t, f.A, a) + assert.Equal(t, f.A, dummyPluginStateA) // Try writing new state type test2 struct { @@ -151,13 +166,14 @@ func TestHandler_newTaskExecutionContext(t *testing.T) { assert.NotNil(t, got.SecretManager()) assert.NotNil(t, got.OutputWriter()) - assert.Equal(t, got.TaskExecutionMetadata().GetOverrides().GetResources(), res) + assert.Equal(t, got.TaskExecutionMetadata().GetOverrides().GetResources(), resources) assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), 
"name-n1-1") assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetID().TaskId, taskID) assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetID().RetryAttempt, uint32(1)) assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetID().NodeExecutionId.GetNodeId(), nodeID) assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetID().NodeExecutionId.GetExecutionId(), wfExecID) + assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetUniqueNodeID(), nodeID) assert.EqualValues(t, got.ResourceManager().(resourcemanager.TaskResourceManager).GetResourcePoolInfo(), make([]*event.ResourcePoolInfo, 0)) @@ -195,6 +211,22 @@ func TestHandler_newTaskExecutionContext(t *testing.T) { assert.NotNil(t, anotherTaskExecCtx.tr) } +func TestHandler_newTaskExecutionContext_taskExecutionID_WithParentInfo(t *testing.T) { + parentInfo := &mocks2.ImmutableParentInfo{} + parentInfo.OnGetUniqueID().Return("n0") + parentInfo.OnCurrentAttempt().Return(uint32(2)) + + nCtx := dummyNodeExecutionContext(t, parentInfo, v1alpha1.EventVersion1) + p := dummyPlugin() + tk := dummyHandler() + got, err := tk.newTaskExecutionContext(context.TODO(), nCtx, p) + assert.NoError(t, err) + assert.NotNil(t, got) + + assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), "name-n0-2-n1-1") + assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetUniqueNodeID(), "n0-2-n1") +} + func TestGetGeneratedNameWith(t *testing.T) { t.Run("length 0", func(t *testing.T) { tCtx := taskExecutionID{ diff --git a/flytepropeller/pkg/controller/nodes/task/transformer_test.go b/flytepropeller/pkg/controller/nodes/task/transformer_test.go index a26705baee..db89dda3e6 100644 --- a/flytepropeller/pkg/controller/nodes/task/transformer_test.go +++ b/flytepropeller/pkg/controller/nodes/task/transformer_test.go @@ -67,6 +67,7 @@ func TestToTaskExecutionEvent(t *testing.T) { generatedName := "generated_name" 
tID.OnGetGeneratedName().Return(generatedName) tID.OnGetID().Return(*id) + tID.OnGetUniqueNodeID().Return("unique-node-id") tMeta := &pluginMocks.TaskExecutionMetadata{} tMeta.OnGetTaskExecutionID().Return(tID) @@ -261,6 +262,7 @@ func TestToTaskExecutionEventWithParent(t *testing.T) { generatedName := "generated_name" tID.OnGetGeneratedName().Return(generatedName) tID.OnGetID().Return(*id) + tID.OnGetUniqueNodeID().Return("unique-node-id") tMeta := &pluginMocks.TaskExecutionMetadata{} tMeta.OnGetTaskExecutionID().Return(tID) From 630724f57a3fa33379fdb3ca116bcca4990e8188 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Thu, 9 Nov 2023 02:36:08 +0800 Subject: [PATCH 34/43] Add Sections in the PR Template (#4367) * add pr templates Signed-off-by: Future Outlier * nit Signed-off-by: Future Outlier * nit Signed-off-by: Future Outlier --------- Signed-off-by: Future Outlier Co-authored-by: Future Outlier --- .github/PULL_REQUEST_TEMPLATE.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7e867ae713..13d4ff7668 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -12,6 +12,10 @@ Happy contributing! +## Docs link + + + ## Describe your changes @@ -29,6 +33,10 @@ Happy contributing! - [ ] All new and existing tests passed. - [ ] All commits are signed-off. +## Setup Process + + + ## Screenshots @@ -36,3 +44,8 @@ Happy contributing! 
## Note to reviewers + +## Related PRs + + + From dd91cbaee86f96582d79f8f2fda7a7f1b80f831e Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Wed, 8 Nov 2023 15:50:22 -0600 Subject: [PATCH 35/43] Update metadata in ArrayNode TaskExecutionEvents (#4355) * adding PluginIdentifier to TaskExecutionMetadata Signed-off-by: Daniel Rammer * sending input and output metadata - removing validating check on flyteadmin temporarily Signed-off-by: Daniel Rammer * readded task execution id verification Signed-off-by: Daniel Rammer * removed dead code Signed-off-by: Daniel Rammer --------- Signed-off-by: Daniel Rammer --- .../controller/nodes/array/event_recorder.go | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/flytepropeller/pkg/controller/nodes/array/event_recorder.go b/flytepropeller/pkg/controller/nodes/array/event_recorder.go index 55cbbce89f..c4e3004011 100644 --- a/flytepropeller/pkg/controller/nodes/array/event_recorder.go +++ b/flytepropeller/pkg/controller/nodes/array/event_recorder.go @@ -120,11 +120,39 @@ func (e *externalResourcesEventRecorder) finalize(ctx context.Context, nCtx inte OccurredAt: occurredAt, Metadata: &event.TaskExecutionMetadata{ ExternalResources: e.externalResources, + PluginIdentifier: "container", }, TaskType: "k8s-array", EventVersion: 1, } + // only attach input values if taskPhase is QUEUED meaning this the first evaluation + if taskPhase == idlcore.TaskExecution_QUEUED { + if eventConfig.RawOutputPolicy == config.RawOutputPolicyInline { + // pass inputs by value + literalMap, err := nCtx.InputReader().Get(ctx) + if err != nil { + return err + } + + taskExecutionEvent.InputValue = &event.TaskExecutionEvent_InputData{ + InputData: literalMap, + } + } else { + // pass inputs by reference + taskExecutionEvent.InputValue = &event.TaskExecutionEvent_InputUri{ + InputUri: nCtx.InputReader().GetInputPath().String(), + } + } + } + + // only attach output uri if taskPhase is SUCCEEDED + if taskPhase == 
idlcore.TaskExecution_SUCCEEDED { + taskExecutionEvent.OutputResult = &event.TaskExecutionEvent_OutputUri{ + OutputUri: v1alpha1.GetOutputsFile(nCtx.NodeStatus().GetOutputDir()).String(), + } + } + // record TaskExecutionEvent return e.EventRecorder.RecordTaskEvent(ctx, taskExecutionEvent, eventConfig) } From fee63b6563d80539a9df21386c0a7e0acf5e73ea Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 9 Nov 2023 09:00:03 -0500 Subject: [PATCH 36/43] Fixes list formatting in flytepropeller arch docs (#4345) Signed-off-by: Thomas J. Fan --- .../component_architecture/flytepropeller_architecture.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/rsts/concepts/component_architecture/flytepropeller_architecture.rst b/rsts/concepts/component_architecture/flytepropeller_architecture.rst index 4fa3cc2cd6..a04f6dbe4d 100644 --- a/rsts/concepts/component_architecture/flytepropeller_architecture.rst +++ b/rsts/concepts/component_architecture/flytepropeller_architecture.rst @@ -13,6 +13,7 @@ Introduction ============ A Flyte :ref:`workflow ` is represented as a Directed Acyclic Graph (DAG) of interconnected Nodes. Flyte supports a robust collection of Node types to ensure diverse functionality. + - ``TaskNodes`` support a plugin system to externally add system integrations. - Control flow can be altered during runtime using ``BranchNodes``, which prune downstream evaluation paths based on input. - ``DynamicNodes`` add nodes to the DAG. 
From 243a8cb3e38b09307c40bc24b206f408a1192d9f Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Thu, 9 Nov 2023 10:48:51 -0600 Subject: [PATCH 37/43] update boilerplate end2end tests (#4393) Signed-off-by: Daniel Rammer --- boilerplate/flyte/end2end/run-tests.py | 111 ++++++++++++++++++++----- 1 file changed, 88 insertions(+), 23 deletions(-) diff --git a/boilerplate/flyte/end2end/run-tests.py b/boilerplate/flyte/end2end/run-tests.py index 5365da006e..9f8a8e85cb 100644 --- a/boilerplate/flyte/end2end/run-tests.py +++ b/boilerplate/flyte/end2end/run-tests.py @@ -5,7 +5,7 @@ import sys import time import traceback -from typing import Dict, List, Mapping, Tuple +from typing import Dict, List, Mapping, Tuple, Optional import click import requests @@ -50,15 +50,17 @@ ("basics.named_outputs.simple_wf_with_named_outputs", {}), # # Getting a 403 for the wikipedia image # # ("basics.reference_task.wf", {}), - ("data_types_and_io.custom_objects.wf", {"x": 10, "y": 20}), + ("data_types_and_io.dataclass.dataclass_wf", {"x": 10, "y": 20}), # Enums are not supported in flyteremote # ("type_system.enums.enum_wf", {"c": "red"}), - ("data_types_and_io.schema.df_wf", {"a": 42}), - ("data_types_and_io.typed_schema.wf", {}), + ("data_types_and_io.structured_dataset.simple_sd_wf", {"a": 42}), # ("my.imperative.workflow.example", {"in1": "hello", "in2": "foo"}), ], "integrations-k8s-spark": [ - ("k8s_spark_plugin.pyspark_pi.my_spark", {"triggered_date": datetime.datetime.now()}), + ( + "k8s_spark_plugin.pyspark_pi.my_spark", + {"triggered_date": datetime.datetime.now()}, + ), ], "integrations-kfpytorch": [ ("kfpytorch_plugin.pytorch_mnist.pytorch_training_wf", {}), @@ -89,20 +91,30 @@ } -def execute_workflow(remote, version, workflow_name, inputs): +def execute_workflow( + remote: FlyteRemote, + version, + workflow_name, + inputs, + cluster_pool_name: Optional[str] = None, +): print(f"Fetching workflow={workflow_name} and version={version}") wf = remote.fetch_workflow(name=workflow_name, 
version=version) - return remote.execute(wf, inputs=inputs, wait=False) + return remote.execute(wf, inputs=inputs, wait=False, cluster_pool=cluster_pool_name) -def executions_finished(executions_by_wfgroup: Dict[str, List[FlyteWorkflowExecution]]) -> bool: +def executions_finished( + executions_by_wfgroup: Dict[str, List[FlyteWorkflowExecution]] +) -> bool: for executions in executions_by_wfgroup.values(): if not all([execution.is_done for execution in executions]): return False return True -def sync_executions(remote: FlyteRemote, executions_by_wfgroup: Dict[str, List[FlyteWorkflowExecution]]): +def sync_executions( + remote: FlyteRemote, executions_by_wfgroup: Dict[str, List[FlyteWorkflowExecution]] +): try: for executions in executions_by_wfgroup.values(): for execution in executions: @@ -125,6 +137,7 @@ def schedule_workflow_groups( workflow_groups: List[str], remote: FlyteRemote, terminate_workflow_on_failure: bool, + cluster_pool_name: Optional[str] = None, ) -> Dict[str, bool]: """ Schedule workflows executions for all workflow groups and return True if all executions succeed, otherwise @@ -135,14 +148,19 @@ def schedule_workflow_groups( for wf_group in workflow_groups: workflows = FLYTESNACKS_WORKFLOW_GROUPS.get(wf_group, []) executions_by_wfgroup[wf_group] = [ - execute_workflow(remote, tag, workflow[0], workflow[1]) for workflow in workflows + execute_workflow(remote, tag, workflow[0], workflow[1], cluster_pool_name) + for workflow in workflows ] # Wait for all executions to finish attempt = 0 - while attempt == 0 or (not executions_finished(executions_by_wfgroup) and attempt < MAX_ATTEMPTS): + while attempt == 0 or ( + not executions_finished(executions_by_wfgroup) and attempt < MAX_ATTEMPTS + ): attempt += 1 - print(f"Not all executions finished yet. Sleeping for some time, will check again in {WAIT_TIME}s") + print( + f"Not all executions finished yet. 
Sleeping for some time, will check again in {WAIT_TIME}s" + ) time.sleep(WAIT_TIME) sync_executions(remote, executions_by_wfgroup) @@ -158,9 +176,13 @@ def schedule_workflow_groups( if len(non_succeeded_executions) != 0: print(f"Failed executions for {wf_group}:") for execution in non_succeeded_executions: - print(f" workflow={execution.spec.launch_plan.name}, execution_id={execution.id.name}") + print( + f" workflow={execution.spec.launch_plan.name}, execution_id={execution.id.name}" + ) if terminate_workflow_on_failure: - remote.terminate(execution, "aborting execution scheduled in functional test") + remote.terminate( + execution, "aborting execution scheduled in functional test" + ) # A workflow group succeeds iff all of its executions succeed results[wf_group] = len(non_succeeded_executions) == 0 return results @@ -179,17 +201,21 @@ def run( priorities: List[str], config_file_path, terminate_workflow_on_failure: bool, + test_project_name: str, + test_project_domain: str, + cluster_pool_name: Optional[str] = None, ) -> List[Dict[str, str]]: remote = FlyteRemote( Config.auto(config_file=config_file_path), - default_project="flytesnacks", - default_domain="development", + test_project_name, + test_project_domain, ) # For a given release tag and priority, this function filters the workflow groups from the flytesnacks # manifest file. For example, for the release tag "v0.2.224" and the priority "P0" it returns [ "core" ]. 
manifest_url = ( - "https://raw.githubusercontent.com/flyteorg/flytesnacks/" f"{flytesnacks_release_tag}/flyte_tests_manifest.json" + "https://raw.githubusercontent.com/flyteorg/flytesnacks/" + f"{flytesnacks_release_tag}/flyte_tests_manifest.json" ) r = requests.get(manifest_url) parsed_manifest = r.json() @@ -197,7 +223,11 @@ def run( workflow_groups = ( ["lite"] if "lite" in priorities - else [group["name"] for group in parsed_manifest if group["priority"] in priorities] + else [ + group["name"] + for group in parsed_manifest + if group["priority"] in priorities + ] ) results = [] @@ -215,7 +245,11 @@ def run( valid_workgroups.append(workflow_group) results_by_wfgroup = schedule_workflow_groups( - flytesnacks_release_tag, valid_workgroups, remote, terminate_workflow_on_failure + flytesnacks_release_tag, + valid_workgroups, + remote, + terminate_workflow_on_failure, + cluster_pool_name, ) for workflow_group, succeeded in results_by_wfgroup.items(): @@ -246,6 +280,9 @@ def run( @click.command() +@click.argument("flytesnacks_release_tag") +@click.argument("priorities") +@click.argument("config_file") @click.option( "--return_non_zero_on_failure", default=False, @@ -258,18 +295,46 @@ def run( is_flag=True, help="Abort failing workflows upon exit", ) -@click.argument("flytesnacks_release_tag") -@click.argument("priorities") -@click.argument("config_file") +@click.option( + "--test_project_name", + default="flytesnacks", + type=str, + is_flag=False, + help="Name of project to run functional tests on", +) +@click.option( + "--test_project_domain", + default="development", + type=str, + is_flag=False, + help="Name of domain in project to run functional tests on", +) +@click.argument( + "cluster_pool_name", + required=False, + type=str, + default=None, +) def cli( flytesnacks_release_tag, priorities, config_file, return_non_zero_on_failure, terminate_workflow_on_failure, + test_project_name, + test_project_domain, + cluster_pool_name, ): 
print(f"return_non_zero_on_failure={return_non_zero_on_failure}") - results = run(flytesnacks_release_tag, priorities, config_file, terminate_workflow_on_failure) + results = run( + flytesnacks_release_tag, + priorities, + config_file, + terminate_workflow_on_failure, + test_project_name, + test_project_domain, + cluster_pool_name, + ) # Write a json object in its own line describing the result of this run to stdout print(f"Result of run:\n{json.dumps(results)}") From 7712626d9cc638728bf471c61148fb8e22c63e44 Mon Sep 17 00:00:00 2001 From: Haytham Abuelfutuh Date: Fri, 10 Nov 2023 09:07:23 -0700 Subject: [PATCH 38/43] Handle all ray job statuses (#4389) Signed-off-by: Haytham Abuelfutuh --- flyteplugins/go/tasks/plugins/k8s/ray/ray.go | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/flyteplugins/go/tasks/plugins/k8s/ray/ray.go b/flyteplugins/go/tasks/plugins/k8s/ray/ray.go index cc8d198334..1d0fde4ca8 100644 --- a/flyteplugins/go/tasks/plugins/k8s/ray/ray.go +++ b/flyteplugins/go/tasks/plugins/k8s/ray/ray.go @@ -470,7 +470,7 @@ func (plugin rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginCont return pluginsCore.PhaseInfoQueued(time.Now(), pluginsCore.DefaultPhaseVersion, "Scheduling"), nil } - // Kuberay creates a Ray cluster first, and then submits a Ray job to the cluster + // KubeRay creates a Ray cluster first, and then submits a Ray job to the cluster switch rayJob.Status.JobDeploymentStatus { case rayv1alpha1.JobDeploymentStatusInitializing: return pluginsCore.PhaseInfoInitializing(rayJob.CreationTimestamp.Time, pluginsCore.DefaultPhaseVersion, "cluster is creating", info), nil @@ -480,7 +480,7 @@ func (plugin rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginCont case rayv1alpha1.JobDeploymentStatusFailedJobDeploy: reason := fmt.Sprintf("Failed to submit Ray job %s with error: %s", rayJob.Name, rayJob.Status.Message) return pluginsCore.PhaseInfoFailure(flyteerr.TaskFailedWithError, reason, info), 
nil - case rayv1alpha1.JobDeploymentStatusWaitForDashboard: + case rayv1alpha1.JobDeploymentStatusWaitForDashboard, rayv1alpha1.JobDeploymentStatusFailedToGetJobStatus: return pluginsCore.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, info), nil case rayv1alpha1.JobDeploymentStatusRunning, rayv1alpha1.JobDeploymentStatusComplete: switch rayJob.Status.JobStatus { @@ -491,10 +491,19 @@ func (plugin rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginCont return pluginsCore.PhaseInfoSuccess(info), nil case rayv1alpha1.JobStatusPending, rayv1alpha1.JobStatusRunning: return pluginsCore.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, info), nil + case rayv1alpha1.JobStatusStopped: + // There is no current usage of this job status in KubeRay. It's unclear what it represents + fallthrough + default: + // We already handle all known job status, so this should never happen unless a future version of ray + // introduced a new job status. + return pluginsCore.PhaseInfoUndefined, fmt.Errorf("unknown job status: %s", rayJob.Status.JobStatus) } + default: + // We already handle all known deployment status, so this should never happen unless a future version of ray + // introduced a new job status. 
+ return pluginsCore.PhaseInfoUndefined, fmt.Errorf("unknown job deployment status: %s", rayJob.Status.JobDeploymentStatus) } - - return pluginsCore.PhaseInfoUndefined, nil } func init() { From 48f53f0995c82568fb10ebf3ffead16c10424e99 Mon Sep 17 00:00:00 2001 From: David Espejo <82604841+davidmirror-ops@users.noreply.github.com> Date: Fri, 10 Nov 2023 11:36:41 -0500 Subject: [PATCH 39/43] Fix YAML indentation for sandbox config (#4385) Signed-off-by: davidmirror-ops --- rsts/deployment/deployment/sandbox.rst | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/rsts/deployment/deployment/sandbox.rst b/rsts/deployment/deployment/sandbox.rst index 5c40eea5eb..46809ae4a3 100644 --- a/rsts/deployment/deployment/sandbox.rst +++ b/rsts/deployment/deployment/sandbox.rst @@ -78,5 +78,35 @@ who wish to dig deeper into the storage layer. 🐋 Flyte sandbox ships with a Docker registry. Tag and push custom workflow images to localhost:30000 📂 The Minio API is hosted on localhost:30002. Use http://localhost:30080/minio/login for Minio console +Configuration +______________ + +The ``config-sandbox.yaml`` file contains configuration for **FlyteAdmin**, +which is the Flyte cluster backend component that processes all client requests +such as workflow executions. The default values are enough to let you connect and use Flyte: + + +.. code-block:: yaml + + admin: + # For GRPC endpoints you might want to use dns:///flyte.myexample.com + endpoint: localhost:30080 + authType: Pkce + insecure: true + console: + endpoint: http://localhost:30080 + logger: + show-source: true + level: 0 + +.. note:: + + You can also create your own config file with `flytectl config init`, which + will create a config file at `~/.flyte/config.yaml`. 
+ + Learn more about the configuration settings in the + {ref}`Deployment Guide ` + + Now that you have the sandbox cluster running, you can now go to the :ref:`User Guide ` or :ref:`Tutorials ` to run tasks and workflows written in ``flytekit``, the Python SDK for Flyte. \ No newline at end of file From 09cb3b1d869a999281affb2ff5fe4358937bba75 Mon Sep 17 00:00:00 2001 From: Jeev B Date: Fri, 10 Nov 2023 11:14:40 -0800 Subject: [PATCH 40/43] Refactor task logs framework (#4396) * Refactor task logs framework Signed-off-by: Jeev B * Return templateLogPluginCollection instead of nil even if no plugins are specified Signed-off-by: Jeev B --------- Signed-off-by: Jeev B --- flyteplugins/go/tasks/logs/config.go | 25 +- flyteplugins/go/tasks/logs/logging_utils.go | 53 ++-- .../go/tasks/logs/logging_utils_test.go | 2 +- .../tasks/pluginmachinery/tasklog/plugin.go | 10 + .../tasks/pluginmachinery/tasklog/template.go | 57 +--- .../pluginmachinery/tasklog/template_test.go | 243 ++++++------------ 6 files changed, 128 insertions(+), 262 deletions(-) diff --git a/flyteplugins/go/tasks/logs/config.go b/flyteplugins/go/tasks/logs/config.go index 69ef17ed89..ca5a6012a8 100644 --- a/flyteplugins/go/tasks/logs/config.go +++ b/flyteplugins/go/tasks/logs/config.go @@ -1,45 +1,34 @@ package logs import ( - "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" "github.com/flyteorg/flyte/flyteplugins/go/tasks/config" "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/tasklog" ) //go:generate pflags LogConfig --default-var=DefaultConfig -// TemplateURI is a URI that accepts templates. See: go/tasks/pluginmachinery/tasklog/template.go for available templates. 
-type TemplateURI = string - // LogConfig encapsulates plugins' log configs type LogConfig struct { IsCloudwatchEnabled bool `json:"cloudwatch-enabled" pflag:",Enable Cloudwatch Logging"` // Deprecated: Please use CloudwatchTemplateURI CloudwatchRegion string `json:"cloudwatch-region" pflag:",AWS region in which Cloudwatch logs are stored."` // Deprecated: Please use CloudwatchTemplateURI - CloudwatchLogGroup string `json:"cloudwatch-log-group" pflag:",Log group to which streams are associated."` - CloudwatchTemplateURI TemplateURI `json:"cloudwatch-template-uri" pflag:",Template Uri to use when building cloudwatch log links"` + CloudwatchLogGroup string `json:"cloudwatch-log-group" pflag:",Log group to which streams are associated."` + CloudwatchTemplateURI tasklog.TemplateURI `json:"cloudwatch-template-uri" pflag:",Template Uri to use when building cloudwatch log links"` IsKubernetesEnabled bool `json:"kubernetes-enabled" pflag:",Enable Kubernetes Logging"` // Deprecated: Please use KubernetesTemplateURI - KubernetesURL string `json:"kubernetes-url" pflag:",Console URL for Kubernetes logs"` - KubernetesTemplateURI TemplateURI `json:"kubernetes-template-uri" pflag:",Template Uri to use when building kubernetes log links"` + KubernetesURL string `json:"kubernetes-url" pflag:",Console URL for Kubernetes logs"` + KubernetesTemplateURI tasklog.TemplateURI `json:"kubernetes-template-uri" pflag:",Template Uri to use when building kubernetes log links"` IsStackDriverEnabled bool `json:"stackdriver-enabled" pflag:",Enable Log-links to stackdriver"` // Deprecated: Please use StackDriverTemplateURI GCPProjectName string `json:"gcp-project" pflag:",Name of the project in GCP"` // Deprecated: Please use StackDriverTemplateURI - StackdriverLogResourceName string `json:"stackdriver-logresourcename" pflag:",Name of the logresource in stackdriver"` - StackDriverTemplateURI TemplateURI `json:"stackdriver-template-uri" pflag:",Template Uri to use when building stackdriver log 
links"` - - Templates []TemplateLogPluginConfig `json:"templates" pflag:"-,"` -} + StackdriverLogResourceName string `json:"stackdriver-logresourcename" pflag:",Name of the logresource in stackdriver"` + StackDriverTemplateURI tasklog.TemplateURI `json:"stackdriver-template-uri" pflag:",Template Uri to use when building stackdriver log links"` -type TemplateLogPluginConfig struct { - DisplayName string `json:"displayName" pflag:",Display name for the generated log when displayed in the console."` - TemplateURIs []TemplateURI `json:"templateUris" pflag:",URI Templates for generating task log links."` - MessageFormat core.TaskLog_MessageFormat `json:"messageFormat" pflag:",Log Message Format."` - Scheme tasklog.TemplateScheme `json:"scheme" pflag:",Templating scheme to use. Supported values are Pod and TaskExecution."` + Templates []tasklog.TemplateLogPlugin `json:"templates" pflag:"-,"` } var ( diff --git a/flyteplugins/go/tasks/logs/logging_utils.go b/flyteplugins/go/tasks/logs/logging_utils.go index 6af1889e9f..4978109458 100644 --- a/flyteplugins/go/tasks/logs/logging_utils.go +++ b/flyteplugins/go/tasks/logs/logging_utils.go @@ -13,11 +13,6 @@ import ( "github.com/flyteorg/flyte/flytestdlib/logger" ) -type logPlugin struct { - Name string - Plugin tasklog.Plugin -} - // Internal func GetLogsForContainerInPod(ctx context.Context, logPlugin tasklog.Plugin, taskExecID pluginsCore.TaskExecutionID, pod *v1.Pod, index uint32, nameSuffix string, extraLogTemplateVarsByScheme *tasklog.TemplateVarsByScheme) ([]*core.TaskLog, error) { if logPlugin == nil { @@ -66,67 +61,53 @@ func GetLogsForContainerInPod(ctx context.Context, logPlugin tasklog.Plugin, tas return logs.TaskLogs, nil } -type taskLogPluginWrapper struct { - logPlugins []logPlugin +type templateLogPluginCollection struct { + plugins []tasklog.TemplateLogPlugin } -func (t taskLogPluginWrapper) GetTaskLogs(input tasklog.Input) (logOutput tasklog.Output, err error) { - logs := make([]*core.TaskLog, 0, 
len(t.logPlugins)) - suffix := input.LogName +func (t templateLogPluginCollection) GetTaskLogs(input tasklog.Input) (tasklog.Output, error) { + var taskLogs []*core.TaskLog - for _, plugin := range t.logPlugins { - input.LogName = plugin.Name + suffix - o, err := plugin.Plugin.GetTaskLogs(input) + for _, plugin := range t.plugins { + o, err := plugin.GetTaskLogs(input) if err != nil { return tasklog.Output{}, err } - - logs = append(logs, o.TaskLogs...) + taskLogs = append(taskLogs, o.TaskLogs...) } - return tasklog.Output{ - TaskLogs: logs, - }, nil + return tasklog.Output{TaskLogs: taskLogs}, nil } // InitializeLogPlugins initializes log plugin based on config. func InitializeLogPlugins(cfg *LogConfig) (tasklog.Plugin, error) { // Use a list to maintain order. - logPlugins := make([]logPlugin, 0, 2) + var plugins []tasklog.TemplateLogPlugin if cfg.IsKubernetesEnabled { if len(cfg.KubernetesTemplateURI) > 0 { - logPlugins = append(logPlugins, logPlugin{Name: "Kubernetes Logs", Plugin: tasklog.NewTemplateLogPlugin(tasklog.TemplateSchemePod, []string{cfg.KubernetesTemplateURI}, core.TaskLog_JSON)}) + plugins = append(plugins, tasklog.TemplateLogPlugin{DisplayName: "Kubernetes Logs", Scheme: tasklog.TemplateSchemePod, TemplateURIs: []tasklog.TemplateURI{cfg.KubernetesTemplateURI}, MessageFormat: core.TaskLog_JSON}) } else { - logPlugins = append(logPlugins, logPlugin{Name: "Kubernetes Logs", Plugin: tasklog.NewTemplateLogPlugin(tasklog.TemplateSchemePod, []string{fmt.Sprintf("%s/#!/log/{{ .namespace }}/{{ .podName }}/pod?namespace={{ .namespace }}", cfg.KubernetesURL)}, core.TaskLog_JSON)}) + plugins = append(plugins, tasklog.TemplateLogPlugin{DisplayName: "Kubernetes Logs", Scheme: tasklog.TemplateSchemePod, TemplateURIs: []tasklog.TemplateURI{fmt.Sprintf("%s/#!/log/{{ .namespace }}/{{ .podName }}/pod?namespace={{ .namespace }}", cfg.KubernetesURL)}, MessageFormat: core.TaskLog_JSON}) } } if cfg.IsCloudwatchEnabled { if len(cfg.CloudwatchTemplateURI) > 0 { - 
logPlugins = append(logPlugins, logPlugin{Name: "Cloudwatch Logs", Plugin: tasklog.NewTemplateLogPlugin(tasklog.TemplateSchemePod, []string{cfg.CloudwatchTemplateURI}, core.TaskLog_JSON)}) + plugins = append(plugins, tasklog.TemplateLogPlugin{DisplayName: "Cloudwatch Logs", Scheme: tasklog.TemplateSchemePod, TemplateURIs: []tasklog.TemplateURI{cfg.CloudwatchTemplateURI}, MessageFormat: core.TaskLog_JSON}) } else { - logPlugins = append(logPlugins, logPlugin{Name: "Cloudwatch Logs", Plugin: tasklog.NewTemplateLogPlugin(tasklog.TemplateSchemePod, []string{fmt.Sprintf("https://console.aws.amazon.com/cloudwatch/home?region=%s#logEventViewer:group=%s;stream=var.log.containers.{{ .podName }}_{{ .namespace }}_{{ .containerName }}-{{ .containerId }}.log", cfg.CloudwatchRegion, cfg.CloudwatchLogGroup)}, core.TaskLog_JSON)}) + plugins = append(plugins, tasklog.TemplateLogPlugin{DisplayName: "Cloudwatch Logs", Scheme: tasklog.TemplateSchemePod, TemplateURIs: []tasklog.TemplateURI{fmt.Sprintf("https://console.aws.amazon.com/cloudwatch/home?region=%s#logEventViewer:group=%s;stream=var.log.containers.{{ .podName }}_{{ .namespace }}_{{ .containerName }}-{{ .containerId }}.log", cfg.CloudwatchRegion, cfg.CloudwatchLogGroup)}, MessageFormat: core.TaskLog_JSON}) } } if cfg.IsStackDriverEnabled { if len(cfg.StackDriverTemplateURI) > 0 { - logPlugins = append(logPlugins, logPlugin{Name: "Stackdriver Logs", Plugin: tasklog.NewTemplateLogPlugin(tasklog.TemplateSchemePod, []string{cfg.StackDriverTemplateURI}, core.TaskLog_JSON)}) + plugins = append(plugins, tasklog.TemplateLogPlugin{DisplayName: "Stackdriver Logs", Scheme: tasklog.TemplateSchemePod, TemplateURIs: []tasklog.TemplateURI{cfg.StackDriverTemplateURI}, MessageFormat: core.TaskLog_JSON}) } else { - logPlugins = append(logPlugins, logPlugin{Name: "Stackdriver Logs", Plugin: tasklog.NewTemplateLogPlugin(tasklog.TemplateSchemePod, 
[]string{fmt.Sprintf("https://console.cloud.google.com/logs/viewer?project=%s&angularJsUrl=%%2Flogs%%2Fviewer%%3Fproject%%3D%s&resource=%s&advancedFilter=resource.labels.pod_name%%3D{{ .podName }}", cfg.GCPProjectName, cfg.GCPProjectName, cfg.StackdriverLogResourceName)}, core.TaskLog_JSON)}) + plugins = append(plugins, tasklog.TemplateLogPlugin{DisplayName: "Stackdriver Logs", Scheme: tasklog.TemplateSchemePod, TemplateURIs: []tasklog.TemplateURI{fmt.Sprintf("https://console.cloud.google.com/logs/viewer?project=%s&angularJsUrl=%%2Flogs%%2Fviewer%%3Fproject%%3D%s&resource=%s&advancedFilter=resource.labels.pod_name%%3D{{ .podName }}", cfg.GCPProjectName, cfg.GCPProjectName, cfg.StackdriverLogResourceName)}, MessageFormat: core.TaskLog_JSON}) } } - if len(cfg.Templates) > 0 { - for _, cfg := range cfg.Templates { - logPlugins = append(logPlugins, logPlugin{Name: cfg.DisplayName, Plugin: tasklog.NewTemplateLogPlugin(cfg.Scheme, cfg.TemplateURIs, cfg.MessageFormat)}) - } - } - - if len(logPlugins) == 0 { - return nil, nil - } - - return taskLogPluginWrapper{logPlugins: logPlugins}, nil + plugins = append(plugins, cfg.Templates...) 
+ return templateLogPluginCollection{plugins: plugins}, nil } diff --git a/flyteplugins/go/tasks/logs/logging_utils_test.go b/flyteplugins/go/tasks/logs/logging_utils_test.go index 066fdd96c8..91048eff16 100644 --- a/flyteplugins/go/tasks/logs/logging_utils_test.go +++ b/flyteplugins/go/tasks/logs/logging_utils_test.go @@ -320,7 +320,7 @@ func assertTestSucceeded(tb testing.TB, config *LogConfig, expectedTaskLogs []*c func TestGetLogsForContainerInPod_Templates(t *testing.T) { assertTestSucceeded(t, &LogConfig{ - Templates: []TemplateLogPluginConfig{ + Templates: []tasklog.TemplateLogPlugin{ { DisplayName: "StackDriver", TemplateURIs: []string{ diff --git a/flyteplugins/go/tasks/pluginmachinery/tasklog/plugin.go b/flyteplugins/go/tasks/pluginmachinery/tasklog/plugin.go index b812221f6d..c43da02e58 100644 --- a/flyteplugins/go/tasks/pluginmachinery/tasklog/plugin.go +++ b/flyteplugins/go/tasks/pluginmachinery/tasklog/plugin.go @@ -16,6 +16,9 @@ const ( TemplateSchemeTaskExecution ) +// TemplateURI is a URI that accepts templates. See: go/tasks/pluginmachinery/tasklog/template.go for available templates. +type TemplateURI = string + type TemplateVar struct { Regex *regexp.Regexp Value string @@ -57,3 +60,10 @@ type Plugin interface { // Generates a TaskLog object given necessary computation information GetTaskLogs(i Input) (logs Output, err error) } + +type TemplateLogPlugin struct { + DisplayName string `json:"displayName" pflag:",Display name for the generated log when displayed in the console."` + TemplateURIs []TemplateURI `json:"templateUris" pflag:",URI Templates for generating task log links."` + MessageFormat core.TaskLog_MessageFormat `json:"messageFormat" pflag:",Log Message Format."` + Scheme TemplateScheme `json:"scheme" pflag:",Templating scheme to use. 
Supported values are Pod and TaskExecution."` +} diff --git a/flyteplugins/go/tasks/pluginmachinery/tasklog/template.go b/flyteplugins/go/tasks/pluginmachinery/tasklog/template.go index 77c49d2695..750b1972df 100644 --- a/flyteplugins/go/tasks/pluginmachinery/tasklog/template.go +++ b/flyteplugins/go/tasks/pluginmachinery/tasklog/template.go @@ -34,6 +34,7 @@ type templateRegexes struct { ExecutionName *regexp.Regexp ExecutionProject *regexp.Regexp ExecutionDomain *regexp.Regexp + GeneratedName *regexp.Regexp } func initDefaultRegexes() templateRegexes { @@ -58,6 +59,7 @@ func initDefaultRegexes() templateRegexes { MustCreateRegex("executionName"), MustCreateRegex("executionProject"), MustCreateRegex("executionDomain"), + MustCreateRegex("generatedName"), } } @@ -121,6 +123,10 @@ func (input Input) templateVarsForScheme(scheme TemplateScheme) TemplateVars { defaultRegexes.NodeID, input.TaskExecutionID.GetUniqueNodeID(), }, + TemplateVar{ + defaultRegexes.GeneratedName, + input.TaskExecutionID.GetGeneratedName(), + }, TemplateVar{ defaultRegexes.TaskRetryAttempt, strconv.FormatUint(uint64(taskExecutionIdentifier.RetryAttempt), 10), @@ -172,55 +178,16 @@ func (input Input) templateVarsForScheme(scheme TemplateScheme) TemplateVars { return vars } -// A simple log plugin that supports templates in urls to build the final log link. -// See `defaultRegexes` for supported templates. 
-type TemplateLogPlugin struct { - scheme TemplateScheme - templateUris []string - messageFormat core.TaskLog_MessageFormat -} - -func (s TemplateLogPlugin) GetTaskLog(podName, podUID, namespace, containerName, containerID, logName string, podRFC3339StartTime string, podRFC3339FinishTime string, podUnixStartTime, podUnixFinishTime int64) (core.TaskLog, error) { - o, err := s.GetTaskLogs(Input{ - LogName: logName, - Namespace: namespace, - PodName: podName, - PodUID: podUID, - ContainerName: containerName, - ContainerID: containerID, - PodRFC3339StartTime: podRFC3339StartTime, - PodRFC3339FinishTime: podRFC3339FinishTime, - PodUnixStartTime: podUnixStartTime, - PodUnixFinishTime: podUnixFinishTime, - }) - - if err != nil || len(o.TaskLogs) == 0 { - return core.TaskLog{}, err - } - - return *o.TaskLogs[0], nil -} - -func (s TemplateLogPlugin) GetTaskLogs(input Input) (Output, error) { - templateVars := input.templateVarsForScheme(s.scheme) - taskLogs := make([]*core.TaskLog, 0, len(s.templateUris)) - for _, templateURI := range s.templateUris { +func (p TemplateLogPlugin) GetTaskLogs(input Input) (Output, error) { + templateVars := input.templateVarsForScheme(p.Scheme) + taskLogs := make([]*core.TaskLog, 0, len(p.TemplateURIs)) + for _, templateURI := range p.TemplateURIs { taskLogs = append(taskLogs, &core.TaskLog{ Uri: replaceAll(templateURI, templateVars), - Name: input.LogName, - MessageFormat: s.messageFormat, + Name: p.DisplayName + input.LogName, + MessageFormat: p.MessageFormat, }) } return Output{TaskLogs: taskLogs}, nil } - -// NewTemplateLogPlugin creates a template-based log plugin with the provided template Uri and message format. -// See `defaultRegexes` for supported templates. 
-func NewTemplateLogPlugin(scheme TemplateScheme, templateUris []string, messageFormat core.TaskLog_MessageFormat) TemplateLogPlugin { - return TemplateLogPlugin{ - scheme: scheme, - templateUris: templateUris, - messageFormat: messageFormat, - } -} diff --git a/flyteplugins/go/tasks/pluginmachinery/tasklog/template_test.go b/flyteplugins/go/tasks/pluginmachinery/tasklog/template_test.go index 320ece05a4..f279707a3b 100644 --- a/flyteplugins/go/tasks/pluginmachinery/tasklog/template_test.go +++ b/flyteplugins/go/tasks/pluginmachinery/tasklog/template_test.go @@ -5,7 +5,6 @@ import ( "regexp" "testing" - "github.com/go-test/deep" "github.com/stretchr/testify/assert" "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" @@ -13,26 +12,6 @@ import ( coreMocks "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core/mocks" ) -func TestTemplateLog(t *testing.T) { - p := NewTemplateLogPlugin(TemplateSchemePod, []string{"https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/flyte-production/kubernetes;stream=var.log.containers.{{.podName}}_{{.podUID}}_{{.namespace}}_{{.containerName}}-{{.containerId}}.log"}, core.TaskLog_JSON) - tl, err := p.GetTaskLog( - "f-uuid-driver", - "pod-uid", - "flyteexamples-production", - "spark-kubernetes-driver", - "cri-o://abc", - "main_logs", - "2015-03-14T17:08:14+01:00", - "2021-06-15T20:47:57+02:00", - 1426349294, - 1623782877, - ) - assert.NoError(t, err) - assert.Equal(t, tl.GetName(), "main_logs") - assert.Equal(t, tl.GetMessageFormat(), core.TaskLog_JSON) - assert.Equal(t, "https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/flyte-production/kubernetes;stream=var.log.containers.f-uuid-driver_pod-uid_flyteexamples-production_spark-kubernetes-driver-abc.log", tl.Uri) -} - // Latest Run: Benchmark_mustInitTemplateRegexes-16 45960 26914 ns/op func Benchmark_initDefaultRegexes(b *testing.B) { for i := 0; i < b.N; i++ { @@ -172,6 +151,7 @@ func 
Test_Input_templateVarsForScheme(t *testing.T) { TemplateVars{ {defaultRegexes.LogName, "main_logs"}, {defaultRegexes.NodeID, "n0-0-n0"}, + {defaultRegexes.GeneratedName, "generated-name"}, {defaultRegexes.TaskRetryAttempt, "1"}, {defaultRegexes.TaskID, "my-task-name"}, {defaultRegexes.TaskVersion, "1"}, @@ -245,147 +225,99 @@ func Test_Input_templateVarsForScheme(t *testing.T) { } } -func Test_templateLogPlugin_Regression(t *testing.T) { - type fields struct { - templateURI string - messageFormat core.TaskLog_MessageFormat - } +func TestTemplateLogPlugin(t *testing.T) { type args struct { - podName string - podUID string - namespace string - containerName string - containerID string - logName string - podRFC3339StartTime string - podRFC3339FinishTime string - podUnixStartTime int64 - podUnixFinishTime int64 + input Input } tests := []struct { - name string - fields fields - args args - want core.TaskLog - wantErr bool + name string + plugin TemplateLogPlugin + args args + want Output }{ { "cloudwatch", - fields{ - templateURI: "https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/flyte-production/kubernetes;stream=var.log.containers.{{.podName}}_{{.namespace}}_{{.containerName}}-{{.containerId}}.log", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + TemplateURIs: []TemplateURI{"https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/flyte-production/kubernetes;stream=var.log.containers.{{.podName}}_{{.namespace}}_{{.containerName}}-{{.containerId}}.log"}, + MessageFormat: core.TaskLog_JSON, }, args{ - podName: "f-uuid-driver", - podUID: "pod-uid", - namespace: "flyteexamples-production", - containerName: "spark-kubernetes-driver", - containerID: "cri-o://abc", - logName: "main_logs", - podRFC3339StartTime: "1970-01-01T01:02:03+01:00", - podRFC3339FinishTime: "1970-01-01T04:25:45+01:00", - podUnixStartTime: 123, - podUnixFinishTime: 12345, - }, - core.TaskLog{ + input: Input{ + PodName: 
"f-uuid-driver", + PodUID: "pod-uid", + Namespace: "flyteexamples-production", + ContainerName: "spark-kubernetes-driver", + ContainerID: "cri-o://abc", + LogName: "main_logs", + PodRFC3339StartTime: "1970-01-01T01:02:03+01:00", + PodRFC3339FinishTime: "1970-01-01T04:25:45+01:00", + PodUnixStartTime: 123, + PodUnixFinishTime: 12345, + }, + }, + Output{TaskLogs: []*core.TaskLog{{ Uri: "https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/flyte-production/kubernetes;stream=var.log.containers.f-uuid-driver_flyteexamples-production_spark-kubernetes-driver-abc.log", MessageFormat: core.TaskLog_JSON, Name: "main_logs", - }, - false, + }}}, }, { "stackdriver", - fields{ - templateURI: "https://console.cloud.google.com/logs/viewer?project=test-gcp-project&angularJsUrl=%2Flogs%2Fviewer%3Fproject%3Dtest-gcp-project&resource=aws_ec2_instance&advancedFilter=resource.labels.pod_name%3D{{.podName}}", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + TemplateURIs: []TemplateURI{"https://console.cloud.google.com/logs/viewer?project=test-gcp-project&angularJsUrl=%2Flogs%2Fviewer%3Fproject%3Dtest-gcp-project&resource=aws_ec2_instance&advancedFilter=resource.labels.pod_name%3D{{.podName}}"}, + MessageFormat: core.TaskLog_JSON, }, args{ - podName: "podName", - podUID: "pod-uid", - namespace: "flyteexamples-production", - containerName: "spark-kubernetes-driver", - containerID: "cri-o://abc", - logName: "main_logs", - podRFC3339StartTime: "1970-01-01T01:02:03+01:00", - podRFC3339FinishTime: "1970-01-01T04:25:45+01:00", - podUnixStartTime: 123, - podUnixFinishTime: 12345, - }, - core.TaskLog{ + input: Input{ + PodName: "podName", + PodUID: "pod-uid", + Namespace: "flyteexamples-production", + ContainerName: "spark-kubernetes-driver", + ContainerID: "cri-o://abc", + LogName: "main_logs", + PodRFC3339StartTime: "1970-01-01T01:02:03+01:00", + PodRFC3339FinishTime: "1970-01-01T04:25:45+01:00", + PodUnixStartTime: 123, + PodUnixFinishTime: 12345, + 
}, + }, + Output{TaskLogs: []*core.TaskLog{{ Uri: "https://console.cloud.google.com/logs/viewer?project=test-gcp-project&angularJsUrl=%2Flogs%2Fviewer%3Fproject%3Dtest-gcp-project&resource=aws_ec2_instance&advancedFilter=resource.labels.pod_name%3DpodName", MessageFormat: core.TaskLog_JSON, Name: "main_logs", - }, - false, + }}}, }, { "kubernetes", - fields{ - templateURI: "https://dashboard.k8s.net/#!/log/{{.namespace}}/{{.podName}}/pod?namespace={{.namespace}}", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + TemplateURIs: []TemplateURI{"https://dashboard.k8s.net/#!/log/{{.namespace}}/{{.podName}}/pod?namespace={{.namespace}}"}, + MessageFormat: core.TaskLog_JSON, }, args{ - podName: "flyteexamples-development-task-name", - podUID: "pod-uid", - namespace: "flyteexamples-development", - containerName: "ignore", - containerID: "ignore", - logName: "main_logs", - podRFC3339StartTime: "1970-01-01T01:02:03+01:00", - podRFC3339FinishTime: "1970-01-01T04:25:45+01:00", - podUnixStartTime: 123, - podUnixFinishTime: 12345, - }, - core.TaskLog{ + input: Input{ + PodName: "flyteexamples-development-task-name", + PodUID: "pod-uid", + Namespace: "flyteexamples-development", + ContainerName: "ignore", + ContainerID: "ignore", + LogName: "main_logs", + PodRFC3339StartTime: "1970-01-01T01:02:03+01:00", + PodRFC3339FinishTime: "1970-01-01T04:25:45+01:00", + PodUnixStartTime: 123, + PodUnixFinishTime: 12345, + }, + }, + Output{TaskLogs: []*core.TaskLog{{ Uri: "https://dashboard.k8s.net/#!/log/flyteexamples-development/flyteexamples-development-task-name/pod?namespace=flyteexamples-development", MessageFormat: core.TaskLog_JSON, Name: "main_logs", - }, - false, + }}}, }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - s := TemplateLogPlugin{ - templateUris: []string{tt.fields.templateURI}, - messageFormat: tt.fields.messageFormat, - } - - got, err := s.GetTaskLog(tt.args.podName, tt.args.podUID, tt.args.namespace, tt.args.containerName, 
tt.args.containerID, tt.args.logName, tt.args.podRFC3339FinishTime, tt.args.podRFC3339FinishTime, tt.args.podUnixStartTime, tt.args.podUnixFinishTime) - if (err != nil) != tt.wantErr { - t.Errorf("GetTaskLog() error = %v, wantErr %v", err, tt.wantErr) - return - } - - if diff := deep.Equal(got, tt.want); diff != nil { - t.Errorf("GetTaskLog() got = %v, want %v, diff: %v", got, tt.want, diff) - } - }) - } -} - -func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { - type fields struct { - scheme TemplateScheme - templateURI string - messageFormat core.TaskLog_MessageFormat - } - type args struct { - input Input - } - tests := []struct { - name string - fields fields - args args - want Output - wantErr bool - }{ { "splunk", - fields{ - templateURI: "https://prd-p-ighar.splunkcloud.com/en-US/app/search/search?q=search%20container_name%3D%22{{ .containerName }}%22", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + TemplateURIs: []TemplateURI{"https://prd-p-ighar.splunkcloud.com/en-US/app/search/search?q=search%20container_name%3D%22{{ .containerName }}%22"}, + MessageFormat: core.TaskLog_JSON, }, args{ input: Input{ @@ -410,13 +342,12 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { }, }, }, - false, }, { "ddog", - fields{ - templateURI: "https://app.datadoghq.com/logs?event&from_ts={{ .podUnixStartTime }}&live=true&query=pod_name%3A{{ .podName }}&to_ts={{ .podUnixFinishTime }}", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + TemplateURIs: []TemplateURI{"https://app.datadoghq.com/logs?event&from_ts={{ .podUnixStartTime }}&live=true&query=pod_name%3A{{ .podName }}&to_ts={{ .podUnixFinishTime }}"}, + MessageFormat: core.TaskLog_JSON, }, args{ input: Input{ @@ -441,13 +372,12 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { }, }, }, - false, }, { "stackdriver-with-rfc3339-timestamp", - fields{ - templateURI: 
"https://console.cloud.google.com/logs/viewer?project=test-gcp-project&angularJsUrl=%2Flogs%2Fviewer%3Fproject%3Dtest-gcp-project&resource=aws_ec2_instance&advancedFilter=resource.labels.pod_name%3D{{.podName}}%20%22{{.podRFC3339StartTime}}%22", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + TemplateURIs: []TemplateURI{"https://console.cloud.google.com/logs/viewer?project=test-gcp-project&angularJsUrl=%2Flogs%2Fviewer%3Fproject%3Dtest-gcp-project&resource=aws_ec2_instance&advancedFilter=resource.labels.pod_name%3D{{.podName}}%20%22{{.podRFC3339StartTime}}%22"}, + MessageFormat: core.TaskLog_JSON, }, args{ input: Input{ @@ -472,14 +402,13 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { }, }, }, - false, }, { "task-with-task-execution-identifier", - fields{ - scheme: TemplateSchemeTaskExecution, - templateURI: "https://flyte.corp.net/console/projects/{{ .executionProject }}/domains/{{ .executionDomain }}/executions/{{ .executionName }}/nodeId/{{ .nodeID }}/taskId/{{ .taskID }}/attempt/{{ .taskRetryAttempt }}/view/logs", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + Scheme: TemplateSchemeTaskExecution, + TemplateURIs: []TemplateURI{"https://flyte.corp.net/console/projects/{{ .executionProject }}/domains/{{ .executionDomain }}/executions/{{ .executionName }}/nodeId/{{ .nodeID }}/taskId/{{ .taskID }}/attempt/{{ .taskRetryAttempt }}/view/logs"}, + MessageFormat: core.TaskLog_JSON, }, args{ input: Input{ @@ -505,14 +434,13 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { }, }, }, - false, }, { "mapped-task-with-task-execution-identifier", - fields{ - scheme: TemplateSchemeTaskExecution, - templateURI: "https://flyte.corp.net/console/projects/{{ .executionProject }}/domains/{{ .executionDomain }}/executions/{{ .executionName }}/nodeId/{{ .nodeID }}/taskId/{{ .taskID }}/attempt/{{ .subtaskParentRetryAttempt }}/mappedIndex/{{ .subtaskExecutionIndex }}/mappedAttempt/{{ .subtaskRetryAttempt }}/view/logs", - messageFormat: 
core.TaskLog_JSON, + TemplateLogPlugin{ + Scheme: TemplateSchemeTaskExecution, + TemplateURIs: []TemplateURI{"https://flyte.corp.net/console/projects/{{ .executionProject }}/domains/{{ .executionDomain }}/executions/{{ .executionName }}/nodeId/{{ .nodeID }}/taskId/{{ .taskID }}/attempt/{{ .subtaskParentRetryAttempt }}/mappedIndex/{{ .subtaskExecutionIndex }}/mappedAttempt/{{ .subtaskRetryAttempt }}/view/logs"}, + MessageFormat: core.TaskLog_JSON, }, args{ input: Input{ @@ -545,23 +473,14 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { }, }, }, - false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - s := TemplateLogPlugin{ - scheme: tt.fields.scheme, - templateUris: []string{tt.fields.templateURI}, - messageFormat: tt.fields.messageFormat, - } - got, err := s.GetTaskLogs(tt.args.input) - if (err != nil) != tt.wantErr { - t.Errorf("NewTaskLog() error = %v, wantErr %v", err, tt.wantErr) - return - } + got, err := tt.plugin.GetTaskLogs(tt.args.input) + assert.NoError(t, err) if !reflect.DeepEqual(got, tt.want) { - t.Errorf("NewTaskLog() got = %v, want %v", got, tt.want) + t.Errorf("GetTaskLogs() got = %v, want %v", got, tt.want) } }) } From 70c23c2cf8dccd44011c2168e7f532735ce8b99c Mon Sep 17 00:00:00 2001 From: Jeev B Date: Fri, 10 Nov 2023 11:42:52 -0800 Subject: [PATCH 41/43] Add support for displaying the Ray dashboard when a RayJob is active (#4397) * Refactor task logs framework Signed-off-by: Jeev B * Return templateLogPluginCollection instead of nil even if no plugins are specified Signed-off-by: Jeev B * Add support for displaying the Ray dashboard when a RayJob is active Signed-off-by: Jeev B * Fix tasklogs returned for Ray task Signed-off-by: Jeev B * Get tasklogs working with task phase Signed-off-by: Jeev B * Misc fixes Signed-off-by: Jeev B * Add tests for dashboard URL link Signed-off-by: Jeev B * Fix linting issues and merge conflicts Signed-off-by: Jeev B --------- Signed-off-by: Jeev B --- 
.../go/tasks/plugins/k8s/ray/config.go | 12 +-- flyteplugins/go/tasks/plugins/k8s/ray/ray.go | 40 ++++++--- .../go/tasks/plugins/k8s/ray/ray_test.go | 84 ++++++++++++++++--- 3 files changed, 108 insertions(+), 28 deletions(-) diff --git a/flyteplugins/go/tasks/plugins/k8s/ray/config.go b/flyteplugins/go/tasks/plugins/k8s/ray/config.go index e123c5b8ab..8601264edf 100644 --- a/flyteplugins/go/tasks/plugins/k8s/ray/config.go +++ b/flyteplugins/go/tasks/plugins/k8s/ray/config.go @@ -8,6 +8,7 @@ import ( pluginsConfig "github.com/flyteorg/flyte/flyteplugins/go/tasks/config" "github.com/flyteorg/flyte/flyteplugins/go/tasks/logs" pluginmachinery "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/k8s" + "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/tasklog" "github.com/flyteorg/flyte/flytestdlib/config" ) @@ -78,11 +79,12 @@ type Config struct { DeprecatedNodeIPAddress string `json:"nodeIPAddress,omitempty" pflag:"-,DEPRECATED. Please use DefaultConfig.[HeadNode|WorkerNode].IPAddress"` // Remote Ray Cluster Config - RemoteClusterConfig pluginmachinery.ClusterConfig `json:"remoteClusterConfig" pflag:"Configuration of remote K8s cluster for ray jobs"` - Logs logs.LogConfig `json:"logs" pflag:"-,Log configuration for ray jobs"` - LogsSidecar *v1.Container `json:"logsSidecar" pflag:"-,Sidecar to inject into head pods for capturing ray job logs"` - Defaults DefaultConfig `json:"defaults" pflag:"-,Default configuration for ray jobs"` - EnableUsageStats bool `json:"enableUsageStats" pflag:",Enable usage stats for ray jobs. 
These stats are submitted to usage-stats.ray.io per https://docs.ray.io/en/latest/cluster/usage-stats.html"` + RemoteClusterConfig pluginmachinery.ClusterConfig `json:"remoteClusterConfig" pflag:"Configuration of remote K8s cluster for ray jobs"` + Logs logs.LogConfig `json:"logs" pflag:"-,Log configuration for ray jobs"` + LogsSidecar *v1.Container `json:"logsSidecar" pflag:"-,Sidecar to inject into head pods for capturing ray job logs"` + DashboardURLTemplate *tasklog.TemplateLogPlugin `json:"dashboardURLTemplate" pflag:",Template for URL of Ray dashboard running on a head node."` + Defaults DefaultConfig `json:"defaults" pflag:"-,Default configuration for ray jobs"` + EnableUsageStats bool `json:"enableUsageStats" pflag:",Enable usage stats for ray jobs. These stats are submitted to usage-stats.ray.io per https://docs.ray.io/en/latest/cluster/usage-stats.html"` } type DefaultConfig struct { diff --git a/flyteplugins/go/tasks/plugins/k8s/ray/ray.go b/flyteplugins/go/tasks/plugins/k8s/ray/ray.go index 1d0fde4ca8..0bc4f1183b 100644 --- a/flyteplugins/go/tasks/plugins/k8s/ray/ray.go +++ b/flyteplugins/go/tasks/plugins/k8s/ray/ray.go @@ -13,6 +13,7 @@ import ( "k8s.io/client-go/kubernetes/scheme" "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/plugins" flyteerr "github.com/flyteorg/flyte/flyteplugins/go/tasks/errors" "github.com/flyteorg/flyte/flyteplugins/go/tasks/logs" @@ -437,26 +438,35 @@ func getEventInfoForRayJob(logConfig logs.LogConfig, pluginContext k8s.PluginCon return nil, fmt.Errorf("failed to initialize log plugins. 
Error: %w", err) } - if logPlugin == nil { - return nil, nil - } - - // TODO: Retrieve the name of head pod from rayJob.status, and add it to task logs - // RayJob CRD does not include the name of the worker or head pod for now + var taskLogs []*core.TaskLog taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID() - logOutput, err := logPlugin.GetTaskLogs(tasklog.Input{ + input := tasklog.Input{ Namespace: rayJob.Namespace, TaskExecutionID: taskExecID, - }) + } + // TODO: Retrieve the name of head pod from rayJob.status, and add it to task logs + // RayJob CRD does not include the name of the worker or head pod for now + logOutput, err := logPlugin.GetTaskLogs(input) if err != nil { return nil, fmt.Errorf("failed to generate task logs. Error: %w", err) } + taskLogs = append(taskLogs, logOutput.TaskLogs...) - return &pluginsCore.TaskInfo{ - Logs: logOutput.TaskLogs, - }, nil + // Handling for Ray Dashboard + dashboardURLTemplate := GetConfig().DashboardURLTemplate + if dashboardURLTemplate != nil && + rayJob.Status.DashboardURL != "" && + rayJob.Status.JobStatus == rayv1alpha1.JobStatusRunning { + dashboardURLOutput, err := dashboardURLTemplate.GetTaskLogs(input) + if err != nil { + return nil, fmt.Errorf("failed to generate Ray dashboard link. Error: %w", err) + } + taskLogs = append(taskLogs, dashboardURLOutput.TaskLogs...) 
+ } + + return &pluginsCore.TaskInfo{Logs: taskLogs}, nil } func (plugin rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginContext k8s.PluginContext, resource client.Object) (pluginsCore.PhaseInfo, error) { @@ -489,8 +499,14 @@ func (plugin rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginCont return pluginsCore.PhaseInfoFailure(flyteerr.TaskFailedWithError, reason, info), nil case rayv1alpha1.JobStatusSucceeded: return pluginsCore.PhaseInfoSuccess(info), nil - case rayv1alpha1.JobStatusPending, rayv1alpha1.JobStatusRunning: + case rayv1alpha1.JobStatusPending: return pluginsCore.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, info), nil + case rayv1alpha1.JobStatusRunning: + phaseInfo := pluginsCore.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, info) + if len(info.Logs) > 0 { + phaseInfo = phaseInfo.WithVersion(pluginsCore.DefaultPhaseVersion + 1) + } + return phaseInfo, nil case rayv1alpha1.JobStatusStopped: // There is no current usage of this job status in KubeRay. 
It's unclear what it represents fallthrough diff --git a/flyteplugins/go/tasks/plugins/k8s/ray/ray_test.go b/flyteplugins/go/tasks/plugins/k8s/ray/ray_test.go index 920fa85d61..ccb518fa03 100644 --- a/flyteplugins/go/tasks/plugins/k8s/ray/ray_test.go +++ b/flyteplugins/go/tasks/plugins/k8s/ray/ray_test.go @@ -24,6 +24,7 @@ import ( pluginIOMocks "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/io/mocks" "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/k8s" mocks2 "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/k8s/mocks" + "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/tasklog" "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/utils" ) @@ -615,6 +616,8 @@ func newPluginContext() k8s.PluginContext { }, }, }) + taskExecID.OnGetUniqueNodeID().Return("unique-node") + taskExecID.OnGetGeneratedName().Return("generated-name") tskCtx := &mocks.TaskExecutionMetadata{} tskCtx.OnGetTaskExecutionID().Return(taskExecID) @@ -642,17 +645,19 @@ func TestGetTaskPhase(t *testing.T) { rayJobPhase rayv1alpha1.JobStatus rayClusterPhase rayv1alpha1.JobDeploymentStatus expectedCorePhase pluginsCore.Phase + expectedError bool }{ - {"", rayv1alpha1.JobDeploymentStatusInitializing, pluginsCore.PhaseInitializing}, - {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusFailedToGetOrCreateRayCluster, pluginsCore.PhasePermanentFailure}, - {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusWaitForDashboard, pluginsCore.PhaseRunning}, - {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusFailedJobDeploy, pluginsCore.PhasePermanentFailure}, - {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhaseRunning}, - {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusFailedToGetJobStatus, pluginsCore.PhaseUndefined}, - {rayv1alpha1.JobStatusRunning, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhaseRunning}, - {rayv1alpha1.JobStatusFailed, 
rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhasePermanentFailure}, - {rayv1alpha1.JobStatusSucceeded, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhaseSuccess}, - {rayv1alpha1.JobStatusSucceeded, rayv1alpha1.JobDeploymentStatusComplete, pluginsCore.PhaseSuccess}, + {"", rayv1alpha1.JobDeploymentStatusInitializing, pluginsCore.PhaseInitializing, false}, + {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusFailedToGetOrCreateRayCluster, pluginsCore.PhasePermanentFailure, false}, + {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusWaitForDashboard, pluginsCore.PhaseRunning, false}, + {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusFailedJobDeploy, pluginsCore.PhasePermanentFailure, false}, + {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhaseRunning, false}, + {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusFailedToGetJobStatus, pluginsCore.PhaseRunning, false}, + {rayv1alpha1.JobStatusRunning, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhaseRunning, false}, + {rayv1alpha1.JobStatusFailed, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhasePermanentFailure, false}, + {rayv1alpha1.JobStatusSucceeded, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhaseSuccess, false}, + {rayv1alpha1.JobStatusSucceeded, rayv1alpha1.JobDeploymentStatusComplete, pluginsCore.PhaseSuccess, false}, + {rayv1alpha1.JobStatusStopped, rayv1alpha1.JobDeploymentStatusComplete, pluginsCore.PhaseUndefined, true}, } for _, tc := range testCases { @@ -663,12 +668,69 @@ func TestGetTaskPhase(t *testing.T) { startTime := metav1.NewTime(time.Now()) rayObject.Status.StartTime = &startTime phaseInfo, err := rayJobResourceHandler.GetTaskPhase(ctx, pluginCtx, rayObject) - assert.Nil(t, err) + if tc.expectedError { + assert.Error(t, err) + } else { + assert.Nil(t, err) + } assert.Equal(t, tc.expectedCorePhase.String(), phaseInfo.Phase().String()) }) } } +func 
TestGetEventInfo_DashboardURL(t *testing.T) { + pluginCtx := newPluginContext() + testCases := []struct { + name string + rayJob rayv1alpha1.RayJob + dashboardURLTemplate tasklog.TemplateLogPlugin + expectedTaskLogs []*core.TaskLog + }{ + { + name: "dashboard URL displayed", + rayJob: rayv1alpha1.RayJob{ + Status: rayv1alpha1.RayJobStatus{ + DashboardURL: "exists", + JobStatus: rayv1alpha1.JobStatusRunning, + }, + }, + dashboardURLTemplate: tasklog.TemplateLogPlugin{ + DisplayName: "Ray Dashboard", + TemplateURIs: []tasklog.TemplateURI{"http://test/{{.generatedName}}"}, + Scheme: tasklog.TemplateSchemeTaskExecution, + }, + expectedTaskLogs: []*core.TaskLog{ + { + Name: "Ray Dashboard", + Uri: "http://test/generated-name", + }, + }, + }, + { + name: "dashboard URL is not displayed", + rayJob: rayv1alpha1.RayJob{ + Status: rayv1alpha1.RayJobStatus{ + JobStatus: rayv1alpha1.JobStatusPending, + }, + }, + dashboardURLTemplate: tasklog.TemplateLogPlugin{ + DisplayName: "dummy", + TemplateURIs: []tasklog.TemplateURI{"http://dummy"}, + }, + expectedTaskLogs: nil, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + assert.NoError(t, SetConfig(&Config{DashboardURLTemplate: &tc.dashboardURLTemplate})) + ti, err := getEventInfoForRayJob(logs.LogConfig{}, pluginCtx, &tc.rayJob) + assert.NoError(t, err) + assert.Equal(t, tc.expectedTaskLogs, ti.Logs) + }) + } +} + func TestGetPropertiesRay(t *testing.T) { rayJobResourceHandler := rayJobResourceHandler{} expected := k8s.PluginProperties{} From c4b040b8b33f1558e9169c751cbff3916e1f1b7e Mon Sep 17 00:00:00 2001 From: Eduardo Apolinario <653394+eapolinario@users.noreply.github.com> Date: Fri, 10 Nov 2023 13:12:04 -0800 Subject: [PATCH 42/43] Disable path filtering for monorepo components (#4404) Signed-off-by: Eduardo Apolinario Co-authored-by: Eduardo Apolinario --- .github/workflows/checks.yml | 15 --------------- .github/workflows/flyteidl-checks.yml | 4 ---- 2 files changed, 19 deletions(-) diff 
--git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 8c235d3d71..41ce00d103 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -6,24 +6,9 @@ concurrency: on: pull_request: - paths: - - 'datacatalog/**' - - 'flyteadmin/**' - - 'flytecopilot/**' - - 'flyteplugins/**' - - 'flytepropeller/**' - - 'flytestdlib/**' push: branches: - master - paths: - - 'datacatalog/**' - - 'flyteadmin/**' - - 'flytecopilot/**' - - 'flyteidl/**' - - 'flyteplugins/**' - - 'flytepropeller/**' - - 'flytestdlib/**' env: GO_VERSION: "1.19" PRIORITIES: "P0" diff --git a/.github/workflows/flyteidl-checks.yml b/.github/workflows/flyteidl-checks.yml index cbf84b97f1..5cdf9733b2 100644 --- a/.github/workflows/flyteidl-checks.yml +++ b/.github/workflows/flyteidl-checks.yml @@ -6,13 +6,9 @@ concurrency: on: pull_request: - paths: - - 'flyteidl/**' push: branches: - master - paths: - - 'flyteidl/**' env: GO_VERSION: "1.19" jobs: From 21b92f41652f09631cd939769eca5b7bd7d992ac Mon Sep 17 00:00:00 2001 From: Honnix Date: Sat, 11 Nov 2023 01:14:57 +0100 Subject: [PATCH 43/43] Silence NotFound when get task resource (#4388) * Silence NotFound when get task resource Signed-off-by: Hongxin Liang * Fix more places Signed-off-by: Hongxin Liang * Fix imports Signed-off-by: Hongxin Liang --------- Signed-off-by: Hongxin Liang Co-authored-by: Dan Rammer --- flyteadmin/pkg/errors/errors.go | 5 +++++ flyteadmin/pkg/errors/errors_test.go | 13 +++++++++++++ .../impl/random_cluster_selector.go | 8 ++------ flyteadmin/pkg/manager/impl/execution_manager.go | 15 +++++---------- .../manager/impl/executions/quality_of_service.go | 2 +- flyteadmin/pkg/manager/impl/executions/queues.go | 3 ++- flyteadmin/pkg/manager/impl/util/resources.go | 3 ++- flyteadmin/pkg/manager/impl/util/shared.go | 10 ++++------ 8 files changed, 34 insertions(+), 25 deletions(-) diff --git a/flyteadmin/pkg/errors/errors.go b/flyteadmin/pkg/errors/errors.go index 42ef63841b..e269715a91 100644 --- 
a/flyteadmin/pkg/errors/errors.go +++ b/flyteadmin/pkg/errors/errors.go @@ -140,3 +140,8 @@ func NewWorkflowExistsIdenticalStructureError(ctx context.Context, request *admi } return statusErr } + +func IsDoesNotExistError(err error) bool { + adminError, ok := err.(FlyteAdminError) + return ok && adminError.Code() == codes.NotFound +} diff --git a/flyteadmin/pkg/errors/errors_test.go b/flyteadmin/pkg/errors/errors_test.go index 4b6b250166..cb6a2a0ae0 100644 --- a/flyteadmin/pkg/errors/errors_test.go +++ b/flyteadmin/pkg/errors/errors_test.go @@ -2,6 +2,7 @@ package errors import ( "context" + "errors" "testing" "github.com/stretchr/testify/assert" @@ -90,3 +91,15 @@ func TestNewWorkflowExistsIdenticalStructureError(t *testing.T) { _, ok = details.GetReason().(*admin.CreateWorkflowFailureReason_ExistsIdenticalStructure) assert.True(t, ok) } + +func TestIsDoesNotExistError(t *testing.T) { + assert.True(t, IsDoesNotExistError(NewFlyteAdminError(codes.NotFound, "foo"))) +} + +func TestIsNotDoesNotExistError(t *testing.T) { + assert.False(t, IsDoesNotExistError(NewFlyteAdminError(codes.Canceled, "foo"))) +} + +func TestIsNotDoesNotExistErrorBecauseOfNoneAdminError(t *testing.T) { + assert.False(t, IsDoesNotExistError(errors.New("foo"))) +} diff --git a/flyteadmin/pkg/executioncluster/impl/random_cluster_selector.go b/flyteadmin/pkg/executioncluster/impl/random_cluster_selector.go index 712470a1ee..74dad42ce0 100644 --- a/flyteadmin/pkg/executioncluster/impl/random_cluster_selector.go +++ b/flyteadmin/pkg/executioncluster/impl/random_cluster_selector.go @@ -6,8 +6,6 @@ import ( "hash/fnv" "math/rand" - "google.golang.org/grpc/codes" - "github.com/flyteorg/flyte/flyteadmin/pkg/errors" "github.com/flyteorg/flyte/flyteadmin/pkg/executioncluster" "github.com/flyteorg/flyte/flyteadmin/pkg/executioncluster/interfaces" @@ -102,10 +100,8 @@ func (s RandomClusterSelector) GetTarget(ctx context.Context, spec *executionclu LaunchPlan: spec.LaunchPlan, ResourceType: 
admin.MatchableResource_EXECUTION_CLUSTER_LABEL, }) - if err != nil { - if flyteAdminError, ok := err.(errors.FlyteAdminError); !ok || flyteAdminError.Code() != codes.NotFound { - return nil, err - } + if err != nil && !errors.IsDoesNotExistError(err) { + return nil, err } var weightedRandomList random.WeightedRandomList diff --git a/flyteadmin/pkg/manager/impl/execution_manager.go b/flyteadmin/pkg/manager/impl/execution_manager.go index e31f501686..06516657d2 100644 --- a/flyteadmin/pkg/manager/impl/execution_manager.go +++ b/flyteadmin/pkg/manager/impl/execution_manager.go @@ -164,11 +164,8 @@ func (m *ExecutionManager) addPluginOverrides(ctx context.Context, executionID * LaunchPlan: launchPlanName, ResourceType: admin.MatchableResource_PLUGIN_OVERRIDE, }) - if err != nil { - ec, ok := err.(errors.FlyteAdminError) - if !ok || ec.Code() != codes.NotFound { - return nil, err - } + if err != nil && !errors.IsDoesNotExistError(err) { + return nil, err } if override != nil && override.Attributes != nil && override.Attributes.GetPluginOverrides() != nil { return override.Attributes.GetPluginOverrides().Overrides, nil @@ -427,11 +424,9 @@ func (m *ExecutionManager) getClusterAssignment(ctx context.Context, request *ad Domain: request.Domain, ResourceType: admin.MatchableResource_CLUSTER_ASSIGNMENT, }) - if err != nil { - if flyteAdminError, ok := err.(errors.FlyteAdminError); !ok || flyteAdminError.Code() != codes.NotFound { - logger.Errorf(ctx, "Failed to get cluster assignment overrides with error: %v", err) - return nil, err - } + if err != nil && !errors.IsDoesNotExistError(err) { + logger.Errorf(ctx, "Failed to get cluster assignment overrides with error: %v", err) + return nil, err } if resource != nil && resource.Attributes.GetClusterAssignment() != nil { return resource.Attributes.GetClusterAssignment(), nil diff --git a/flyteadmin/pkg/manager/impl/executions/quality_of_service.go b/flyteadmin/pkg/manager/impl/executions/quality_of_service.go index 
345b0e926d..a96d99d3d6 100644 --- a/flyteadmin/pkg/manager/impl/executions/quality_of_service.go +++ b/flyteadmin/pkg/manager/impl/executions/quality_of_service.go @@ -43,7 +43,7 @@ func (q qualityOfServiceAllocator) getQualityOfServiceFromDb(ctx context.Context ResourceType: admin.MatchableResource_QUALITY_OF_SERVICE_SPECIFICATION, }) if err != nil { - if _, ok := err.(errors.FlyteAdminError); !ok || err.(errors.FlyteAdminError).Code() != codes.NotFound { + if !errors.IsDoesNotExistError(err) { logger.Warningf(ctx, "Failed to fetch override values when assigning quality of service values for [%+v] with err: %v", workflowIdentifier, err) diff --git a/flyteadmin/pkg/manager/impl/executions/queues.go b/flyteadmin/pkg/manager/impl/executions/queues.go index 9ec4ff865a..5e4706700c 100644 --- a/flyteadmin/pkg/manager/impl/executions/queues.go +++ b/flyteadmin/pkg/manager/impl/executions/queues.go @@ -4,6 +4,7 @@ import ( "context" "math/rand" + "github.com/flyteorg/flyte/flyteadmin/pkg/errors" "github.com/flyteorg/flyte/flyteadmin/pkg/manager/impl/resources" "github.com/flyteorg/flyte/flyteadmin/pkg/manager/interfaces" repoInterfaces "github.com/flyteorg/flyte/flyteadmin/pkg/repositories/interfaces" @@ -64,7 +65,7 @@ func (q *queueAllocatorImpl) GetQueue(ctx context.Context, identifier core.Ident ResourceType: admin.MatchableResource_EXECUTION_QUEUE, }) - if err != nil { + if err != nil && !errors.IsDoesNotExistError(err) { logger.Warningf(ctx, "Failed to fetch override values when assigning execution queue for [%+v] with err: %v", identifier, err) } diff --git a/flyteadmin/pkg/manager/impl/util/resources.go b/flyteadmin/pkg/manager/impl/util/resources.go index 0180484005..8001b907dd 100644 --- a/flyteadmin/pkg/manager/impl/util/resources.go +++ b/flyteadmin/pkg/manager/impl/util/resources.go @@ -6,6 +6,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" + "github.com/flyteorg/flyte/flyteadmin/pkg/errors" "github.com/flyteorg/flyte/flyteadmin/pkg/manager/interfaces" 
runtimeInterfaces "github.com/flyteorg/flyte/flyteadmin/pkg/runtime/interfaces" workflowengineInterfaces "github.com/flyteorg/flyte/flyteadmin/pkg/workflowengine/interfaces" @@ -100,7 +101,7 @@ func GetTaskResources(ctx context.Context, id *core.Identifier, resourceManager } resource, err := resourceManager.GetResource(ctx, request) - if err != nil { + if err != nil && !errors.IsDoesNotExistError(err) { logger.Infof(ctx, "Failed to fetch override values when assigning task resource default values for [%+v]: %v", id, err) } diff --git a/flyteadmin/pkg/manager/impl/util/shared.go b/flyteadmin/pkg/manager/impl/util/shared.go index cf7febd619..24c97f416a 100644 --- a/flyteadmin/pkg/manager/impl/util/shared.go +++ b/flyteadmin/pkg/manager/impl/util/shared.go @@ -275,12 +275,10 @@ func GetMatchableResource(ctx context.Context, resourceManager interfaces.Resour Workflow: workflowName, ResourceType: resourceType, }) - if err != nil { - if flyteAdminError, ok := err.(errors.FlyteAdminError); !ok || flyteAdminError.Code() != codes.NotFound { - logger.Errorf(ctx, "Failed to get %v overrides in %s project %s domain %s workflow with error: %v", resourceType, - project, domain, workflowName, err) - return nil, err - } + if err != nil && !errors.IsDoesNotExistError(err) { + logger.Errorf(ctx, "Failed to get %v overrides in %s project %s domain %s workflow with error: %v", resourceType, + project, domain, workflowName, err) + return nil, err } return matchableResource, nil }