diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7e867ae713..13d4ff7668 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -12,6 +12,10 @@ Happy contributing! +## Docs link + + + ## Describe your changes @@ -29,6 +33,10 @@ Happy contributing! - [ ] All new and existing tests passed. - [ ] All commits are signed-off. +## Setup Process + + + ## Screenshots @@ -36,3 +44,8 @@ Happy contributing! ## Note to reviewers + +## Related PRs + + + diff --git a/.github/workflows/bump-tags.yml b/.github/workflows/bump-tags.yml deleted file mode 100644 index 2ea6cf0b5a..0000000000 --- a/.github/workflows/bump-tags.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: Bump git tags - -on: - workflow_call: - outputs: - next_version: - description: "Tag name" - value: ${{ jobs.bump_version.outputs.next_version }} - secrets: - FLYTE_BOT_PAT: - required: true -jobs: - bump_version: - name: Bump Version - runs-on: ubuntu-latest - outputs: - next_version: ${{ steps.get_semver.outputs.next_version }} - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: '0' - # Bump patch version - - uses: rickstaa/action-get-semver@v1 - id: get_semver - with: - bump_level: "patch" - - name: Print current and next versions - run: | - echo "Current version: ${{ steps.get_semver.outputs.current_version }}" - echo "Next version: ${{ steps.get_semver.outputs.next_version }}" - # Generate all tags for all components - - uses: actions/github-script@v6 - with: - github-token: ${{ secrets.FLYTE_BOT_PAT }} - script: | - github.rest.git.createRef({ - owner: context.repo.owner, - repo: context.repo.repo, - ref: `refs/tags/${{ steps.get_semver.outputs.next_version }}`, - sha: context.sha - }) - const components = [ - "datacatalog", - "flyteadmin", - "flytecopilot", - "flyteidl", - "flyteplugins", - "flytepropeller", - "flytestdlib", - ]; - for (const c of components) { - github.rest.git.createRef({ - owner: context.repo.owner, - repo: 
context.repo.repo, - ref: `refs/tags/${c}/${{ steps.get_semver.outputs.next_version }}`, - sha: context.sha - }) - } diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 34b6ca4781..41ce00d103 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -6,24 +6,9 @@ concurrency: on: pull_request: - paths: - - 'datacatalog/**' - - 'flyteadmin/**' - - 'flytecopilot/**' - - 'flyteplugins/**' - - 'flytepropeller/**' - - 'flytestdlib/**' push: branches: - master - paths: - - 'datacatalog/**' - - 'flyteadmin/**' - - 'flytecopilot/**' - - 'flyteidl/**' - - 'flyteplugins/**' - - 'flytepropeller/**' - - 'flytestdlib/**' env: GO_VERSION: "1.19" PRIORITIES: "P0" @@ -134,47 +119,11 @@ jobs: component: ${{ matrix.component }} go-version: ${{ needs.unpack-envvars.outputs.go-version }} - bump-tags: - name: Bump git tags - # TODO(monorepo): skip this if author is flyte-bot? - if: ${{ github.event_name != 'pull_request' }} - needs: - - integration - - lint - - unit-tests - - generate - uses: ./.github/workflows/bump-tags.yml - secrets: - FLYTE_BOT_PAT: ${{ secrets.FLYTE_BOT_PAT }} - - # TODO(monorepo): we are not going to release any binary - # goreleaser: - # name: Goreleaser - # needs: [ bump_version ] # Only to ensure it can successfully build - # uses: flyteorg/flytetools/.github/workflows/goreleaser.yml@master - # with: - # go-version: "1.19" - # secrets: - # FLYTE_BOT_PAT: ${{ secrets.FLYTE_BOT_PAT }} - - push_docker_image: - name: Build & Push Image - needs: [ bump-tags ] - strategy: - fail-fast: false - matrix: - component: - - datacatalog - - flyteadmin - - flytecopilot - - flytepropeller - - flytescheduler - uses: ./.github/workflows/publish.yml + build_docker_images: + name: Build Images + uses: ./.github/workflows/publish-images.yml with: - version: ${{ needs.bump-tags.outputs.next_version }} - component: ${{ matrix.component }} - dockerfile: Dockerfile.${{ matrix.component }} - push: true + push: false secrets: 
FLYTE_BOT_PAT: ${{ secrets.FLYTE_BOT_PAT }} FLYTE_BOT_USERNAME: ${{ secrets.FLYTE_BOT_USERNAME }} diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index 9f807887c2..0323f1d971 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -7,29 +7,57 @@ on: required: true jobs: - bump-version: - name: bump-version + generate-tags: + name: Generate git tags runs-on: ubuntu-latest - outputs: - version: ${{ steps.bump-version.outputs.tag }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: - fetch-depth: "0" + fetch-depth: '0' + - uses: actions/github-script@v6 + with: + github-token: ${{ secrets.FLYTE_BOT_PAT }} + script: | + github.rest.git.createRef({ + owner: context.repo.owner, + repo: context.repo.repo, + ref: `refs/tags/${{ github.event.inputs.version }}`, + sha: context.sha + }) + const components = [ + "datacatalog", + "flyteadmin", + "flytecopilot", + "flyteidl", + "flyteplugins", + "flytepropeller", + "flytestdlib", + ]; + for (const c of components) { + github.rest.git.createRef({ + owner: context.repo.owner, + repo: context.repo.repo, + ref: `refs/tags/${c}/${{ github.event.inputs.version }}`, + sha: context.sha + }) + } - - name: Bump version and push tag - id: bump-version - uses: anothrNick/github-tag-action@1.36.0 - env: - GITHUB_TOKEN: ${{ secrets.FLYTE_BOT_PAT }} - WITH_V: true - CUSTOM_TAG: ${{ github.event.inputs.version }} - RELEASE_BRANCHES: master + build-docker-images: + needs: + - generate-tags + uses: ./.github/workflows/publish-images.yml + with: + version: ${{ github.event.inputs.version }} + push: true + secrets: + FLYTE_BOT_PAT: ${{ secrets.FLYTE_BOT_PAT }} + FLYTE_BOT_USERNAME: ${{ secrets.FLYTE_BOT_USERNAME }} publish-flyte-binary-image: name: Publish flyte binary image for the release version runs-on: ubuntu-latest - needs: [bump-version] + needs: + - generate-tags steps: - name: Checkout uses: actions/checkout@v2 @@ -45,24 +73,25 @@ jobs: 
- name: Tag image to release version run: | - for release in latest ${{ needs.bump-version.outputs.version }}; do + for release in latest ${{ github.event.inputs.version }}; do docker buildx imagetools create --tag "ghcr.io/${{ github.repository_owner }}/flyte-binary-release:${release}" "ghcr.io/${{ github.repository_owner }}/flyte-binary:sha-${{ github.sha }}" done publish-flyte-component-image: name: Publish flyte component image for the release version runs-on: ubuntu-latest - needs: [bump-version] + needs: + - build-docker-images strategy: matrix: component: [ + datacatalog, + flyteadmin, flyteconsole, + flytecopilot, flytepropeller, - flyteadmin, - datacatalog, flytescheduler, - flytecopilot, ] steps: - name: Checkout @@ -76,11 +105,7 @@ jobs: - name: Get Latest Version of component id: set_version run: | - if [ ${{ matrix.component }} = "flytecopilot" ]; then - echo ::set-output name=version::$(yq eval '.configmap.copilot.plugins.k8s.co-pilot.image' charts/flyte-core/values.yaml | cut -d ":" -f 2 ) - else - echo ::set-output name=version::$(yq eval '.${{ matrix.component }}.image.tag' charts/flyte-core/values.yaml) - fi + echo ::set-output name=version::$(yq eval '.${{ matrix.component }}.image.tag' charts/flyte-core/values.yaml) shell: bash - name: Login to GitHub Container Registry @@ -92,14 +117,15 @@ jobs: - name: Tag Image to release version run: | - for release in latest ${{ needs.bump-version.outputs.version }}; do + for release in latest ${{ github.event.inputs.version }}; do docker buildx imagetools create --tag "ghcr.io/${{ github.repository_owner }}/${{ matrix.component }}-release:${release}" "ghcr.io/${{ github.repository_owner }}/${{ matrix.component }}:${{ steps.set_version.outputs.version }}" done helm-release: name: Flyte helm release runs-on: ubuntu-latest - needs: [bump-version] + needs: + - build-docker-images steps: - name: Checkout uses: actions/checkout@v2 @@ -118,7 +144,7 @@ jobs: git config user.email "${{ github.actor }} <${{ 
github.actor }}@users.noreply.github.com>" - name: Prepare Flyte Helm Release env: - VERSION: ${{ needs.bump-version.outputs.version }} + VERSION: ${{ github.event.inputs.version }} REPOSITORY: "https://flyteorg.github.io/flyte" run: | make prepare_artifacts @@ -131,7 +157,8 @@ jobs: manifest-release: name: Flyte manifest release runs-on: ubuntu-latest - needs: [bump-version] + needs: + - build-docker-images steps: - name: Checkout uses: actions/checkout@v2 @@ -140,7 +167,7 @@ jobs: - name: Prepare Flyte Release env: - VERSION: ${{ needs.bump-version.outputs.version }} + VERSION: ${{ github.event.inputs.version }} run: | make prepare_artifacts git stash diff --git a/.github/workflows/end2end.yml b/.github/workflows/end2end.yml index ad21cf2cf1..cee310b195 100644 --- a/.github/workflows/end2end.yml +++ b/.github/workflows/end2end.yml @@ -87,9 +87,8 @@ jobs: advanced_composition/advanced_composition/dynamics.py \ advanced_composition/advanced_composition/map_task.py \ advanced_composition/advanced_composition/subworkflows.py \ - data_types_and_io/data_types_and_io/custom_objects.py \ - data_types_and_io/data_types_and_io/schema.py \ - data_types_and_io/data_types_and_io/typed_schema.py ; + data_types_and_io/data_types_and_io/dataclass.py \ + data_types_and_io/data_types_and_io/structured_dataset.py ; do pyflyte --config ./boilerplate/flyte/end2end/functional-test-config.yaml \ register \ diff --git a/.github/workflows/flyteidl-checks.yml b/.github/workflows/flyteidl-checks.yml index cbf84b97f1..5cdf9733b2 100644 --- a/.github/workflows/flyteidl-checks.yml +++ b/.github/workflows/flyteidl-checks.yml @@ -6,13 +6,9 @@ concurrency: on: pull_request: - paths: - - 'flyteidl/**' push: branches: - master - paths: - - 'flyteidl/**' env: GO_VERSION: "1.19" jobs: diff --git a/.github/workflows/generate_flyte_manifest.yml b/.github/workflows/generate_flyte_manifest.yml index cf5e4e3123..913da71a7a 100644 --- a/.github/workflows/generate_flyte_manifest.yml +++ 
b/.github/workflows/generate_flyte_manifest.yml @@ -2,11 +2,17 @@ name: Generate Flyte manifest on: workflow_dispatch: + inputs: + next-version: + description: "version name. example v0.1.1" + required: true jobs: update-flyte-releases: name: Update Flyte components runs-on: ubuntu-latest + needs: + - generate-tags steps: - uses: actions/checkout@v2 with: @@ -14,7 +20,9 @@ jobs: - uses: actions/setup-go@v2 with: go-version: "1.19" - - name: Build kustomize + - name: Update references + env: + VERSION: ${{ github.event.inputs.next-version }} run: | make release_automation make kustomize diff --git a/.github/workflows/publish-images.yml b/.github/workflows/publish-images.yml new file mode 100644 index 0000000000..6c12e9a0ce --- /dev/null +++ b/.github/workflows/publish-images.yml @@ -0,0 +1,41 @@ +name: Build & Push All Components Images + +on: + workflow_call: + inputs: + version: + description: "Version of image" + required: false + type: string + default: ci-checks + push: + description: "Push to registry" + required: true + type: boolean + secrets: + FLYTE_BOT_USERNAME: + required: true + FLYTE_BOT_PAT: + required: true + +jobs: + push_docker_image: + name: Build & Push Image + strategy: + fail-fast: false + matrix: + component: + - datacatalog + - flyteadmin + - flytecopilot + - flytepropeller + - flytescheduler + uses: ./.github/workflows/publish.yml + with: + version: ${{ inputs.version }} + component: ${{ matrix.component }} + dockerfile: Dockerfile.${{ matrix.component }} + push: ${{ inputs.push }} + secrets: + FLYTE_BOT_PAT: ${{ secrets.FLYTE_BOT_PAT }} + FLYTE_BOT_USERNAME: ${{ secrets.FLYTE_BOT_USERNAME }} diff --git a/.github/workflows/single-binary.yml b/.github/workflows/single-binary.yml index c75892a292..630068da00 100644 --- a/.github/workflows/single-binary.yml +++ b/.github/workflows/single-binary.yml @@ -201,9 +201,8 @@ jobs: advanced_composition/advanced_composition/dynamics.py \ advanced_composition/advanced_composition/map_task.py \ 
advanced_composition/advanced_composition/subworkflows.py \ - data_types_and_io/data_types_and_io/custom_objects.py \ - data_types_and_io/data_types_and_io/schema.py \ - data_types_and_io/data_types_and_io/typed_schema.py ; + data_types_and_io/data_types_and_io/dataclass.py \ + data_types_and_io/data_types_and_io/structured_dataset.py ; do pyflyte --config ./boilerplate/flyte/end2end/functional-test-config.yaml \ register \ diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index c5e209e044..428d42b5ed 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -12,11 +12,14 @@ jobs: - uses: actions/stale@v8 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - stale-issue-message: "Hello πŸ‘‹, This issue has been inactive for over 9 months. To help maintain a clean and focused backlog, we'll be marking this issue as stale and will close the issue if we detect no activity in the next 7 days. Thank you for your contribution and understanding! πŸ™" - close-issue-message: "Hello πŸ‘‹, This issue has been inactive for over 9 months and hasn't received any updates since it was marked as stale. We'll be closing this issue for now, but if you believe this issue is still relevant, please feel free to reopen it. Thank you for your contribution and understanding! πŸ™" + stale-issue-message: > + Hello πŸ‘‹, this issue has been inactive for over 9 months. To help maintain a clean and focused backlog, + we'll be marking this issue as stale and will engage on it to decide if it is still applicable. + + Thank you for your contribution and understanding! πŸ™ stale-issue-label: "stale" - exempt-issue-labels: "needs discussion" # Comma-separated list of labels. + exempt-issue-labels: "needs discussion,untriaged" # Comma-separated list of labels. 
days-before-stale: 270 - days-before-close: 7 + days-before-close: -1 ascending: true # https://github.com/actions/stale#ascending operations-per-run: 500 diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md deleted file mode 100644 index eaab5d9c28..0000000000 --- a/CONTRIBUTORS.md +++ /dev/null @@ -1,5 +0,0 @@ -### Contributors - - - - diff --git a/Dockerfile b/Dockerfile index ea5cba63f3..0134bd8cea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ ARG FLYTECONSOLE_VERSION=latest FROM ghcr.io/flyteorg/flyteconsole:${FLYTECONSOLE_VERSION} AS flyteconsole -FROM --platform=${BUILDPLATFORM} golang:1.19.1-bullseye AS flytebuilder +FROM --platform=${BUILDPLATFORM} golang:1.19.13-bookworm AS flytebuilder ARG TARGETARCH ENV GOARCH "${TARGETARCH}" @@ -26,7 +26,7 @@ RUN --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/r go build -tags console -v -o dist/flyte cmd/main.go -FROM debian:bullseye-slim +FROM debian:bookworm-slim ARG FLYTE_VERSION ENV FLYTE_VERSION "${FLYTE_VERSION}" diff --git a/README.md b/README.md index ba0a5537e0..b83a3f81c5 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@

- πŸ—οΈ πŸš€ πŸ“ˆ + :building_construction: :rocket: :chart_with_upwards_trend:

@@ -20,21 +20,21 @@ Flyte is an **open-source orchestrator** that facilitates building production-gr

- Current Release + Current Release label - Sandbox Status + Sandbox Status label - Test Status + Test Status label - License + License label - OpenSSF Best Practices + OpenSSF Best Practices label - Flyte Helm Chart + Flyte Helm Chart label - - Flyte Slack + + Flyte Slack label

@@ -47,7 +47,7 @@ Flyte is an **open-source orchestrator** that facilitates building production-gr
-Getting started with Flyte +Getting started with Flyte, showing the welcome screen and Flyte dashboard --- @@ -55,34 +55,34 @@ Flyte is an **open-source orchestrator** that facilitates building production-gr We ship new features, bug fixes and performance improvements regularly. Read our [release notes](https://github.com/flyteorg/flyte/releases) to stay updated. -πŸš€ **Strongly typed interfaces**: Validate your data at every step of the workflow by defining data guardrails using Flyte types.
-🌐 **Any language**: Write code in any language using raw containers, or choose [Python](https://github.com/flyteorg/flytekit), [Java](https://github.com/flyteorg/flytekit-java), [Scala](https://github.com/flyteorg/flytekit-java) or [JavaScript](https://github.com/NotMatthewGriffin/pterodactyl) SDKs to develop your Flyte workflows.
-πŸ“Š **Map tasks**: Achieve parallel code execution with minimal configuration using [map tasks](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/map_task.html).
-🌟 **Dynamic workflows**: [Build flexible and adaptable workflows](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/dynamics.html) that can change and evolve as needed, making it easier to respond to changing requirements.
-🌳 **Branching**: [Selectively execute branches](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/conditions.html) of your workflow based on static or dynamic data produced by other tasks or input data.
-πŸ“‚ **FlyteFile & FlyteDirectory**: Transfer [files](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/files.html) and [directories](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/folders.html) between local and cloud storage.
-πŸ—ƒοΈ **Structured dataset**: Convert dataframes between types and enforce column-level type checking using the abstract 2D representation provided by [Structured Dataset](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/data_types_and_io/structured_dataset.html).
-πŸ›‘οΈ **Recover from failures**: Recover only the failed tasks.
-πŸ” **Rerun a single task**: Rerun workflows at the most granular level without modifying the previous state of a data/ML workflow.
-🚦 **Versioned workflows**: Reproduce results and roll back to a previous workflow version any time.
-πŸ” **Cache outputs**: Cache task outputs by passing `cache=True` to the task decorator.
-🚩 **Intra-task checkpointing**: [Checkpoint progress](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/checkpoint.html) within a task execution.
-🌎 **Multi-tenancy**: Multiple users can share the same platform while maintaining their own distinct data and configurations.
-⏰ **Timeout**: Define a timeout period, after which the task is marked as failure.
-πŸ”’ **Immutability**: Immutable executions help ensure reproducibility by preventing any changes to the state of an execution.
-🧬 **Data lineage**: Track the movement and transformation of data throughout the lifecycle of your data and ML workflows.
-πŸ“ˆ **Data visualization**: Visualize data, monitor models and view training history through plots.
-🏭 **Dev to prod**: As simple as changing your [domain](https://docs.flyte.org/en/latest/concepts/domains.html) from development or staging to production.
-πŸ’Έ **Spot or preemptible instances**: Schedule your workflows on spot instances by setting `interruptible` to `True` in the task decorator.
-☁️ **Cloud-native deployment**: Deploy Flyte on AWS, GCP, Azure and other cloud services.
-πŸ“… **Scheduling**: [Schedule](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/productionizing/lp_schedules.html) your data and ML workflows to run at a specific time.
-πŸ“’ **Notifications**: Stay informed about changes to your workflow's state by configuring [notifications](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/productionizing/lp_notifications.html) through Slack, PagerDuty or email.
-βŒ›οΈ **Timeline view**: Evaluate the duration of each of your Flyte tasks and identify potential bottlenecks.
-πŸ’¨ **GPU acceleration**: Enable and control your tasks’ GPU demands by requesting resources in the task decorator.
-🐳 **Dependency isolation via containers**: Maintain separate sets of dependencies for your tasks so no dependency conflicts arise.
-πŸ”€ **Parallelism**: Flyte tasks are inherently parallel to optimize resource consumption and improve performance.
-πŸ’Ύ **Allocate resources dynamically** at the task level.
-⏯️ [Wait](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/waiting_for_external_inputs.html) for **external inputs** before proceeding with the execution.
+:rocket: **Strongly typed interfaces**: Validate your data at every step of the workflow by defining data guardrails using Flyte types.
+:globe_with_meridians: **Any language**: Write code in any language using raw containers, or choose [Python](https://github.com/flyteorg/flytekit), [Java](https://github.com/flyteorg/flytekit-java), [Scala](https://github.com/flyteorg/flytekit-java) or [JavaScript](https://github.com/NotMatthewGriffin/pterodactyl) SDKs to develop your Flyte workflows.
+:bar_chart: **Map tasks**: Achieve parallel code execution with minimal configuration using [map tasks](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/map_task.html).
+:star2: **Dynamic workflows**: [Build flexible and adaptable workflows](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/dynamics.html) that can change and evolve as needed, making it easier to respond to changing requirements.
+:deciduous_tree: **Branching**: [Selectively execute branches](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/conditions.html) of your workflow based on static or dynamic data produced by other tasks or input data.
+:open_file_folder: **FlyteFile & FlyteDirectory**: Transfer [files](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/files.html) and [directories](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/folders.html) between local and cloud storage.
+:card_file_box: **Structured dataset**: Convert dataframes between types and enforce column-level type checking using the abstract 2D representation provided by [Structured Dataset](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/data_types_and_io/structured_dataset.html).
+:shield: **Recover from failures**: Recover only the failed tasks.
+:arrows_counterclockwise: **Rerun a single task**: Rerun workflows at the most granular level without modifying the previous state of a data/ML workflow.
+:vertical_traffic_light: **Versioned workflows**: Reproduce results and roll back to a previous workflow version any time.
+:mag: **Cache outputs**: Cache task outputs by passing `cache=True` to the task decorator.
+:triangular_flag_on_post: **Intra-task checkpointing**: [Checkpoint progress](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/checkpoint.html) within a task execution.
+:earth_americas: **Multi-tenancy**: Multiple users can share the same platform while maintaining their own distinct data and configurations.
+:alarm_clock: **Timeout**: Define a timeout period, after which the task is marked as failure.
+:lock: **Immutability**: Immutable executions help ensure reproducibility by preventing any changes to the state of an execution.
+:dna: **Data lineage**: Track the movement and transformation of data throughout the lifecycle of your data and ML workflows.
+:chart_with_upwards_trend: **Data visualization**: Visualize data, monitor models and view training history through plots.
+:factory: **Dev to prod**: As simple as changing your [domain](https://docs.flyte.org/en/latest/concepts/domains.html) from development or staging to production.
+:money_with_wings: **Spot or preemptible instances**: Schedule your workflows on spot instances by setting `interruptible` to `True` in the task decorator.
+:cloud: **Cloud-native deployment**: Deploy Flyte on AWS, GCP, Azure and other cloud services.
+:calendar: **Scheduling**: [Schedule](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/productionizing/lp_schedules.html) your data and ML workflows to run at a specific time.
+:mega: **Notifications**: Stay informed about changes to your workflow's state by configuring [notifications](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/productionizing/lp_notifications.html) through Slack, PagerDuty or email.
+:hourglass: **Timeline view**: Evaluate the duration of each of your Flyte tasks and identify potential bottlenecks.
+:fast_forward: **GPU acceleration**: Enable and control your tasks’ GPU demands by requesting resources in the task decorator.
+:whale: **Dependency isolation via containers**: Maintain separate sets of dependencies for your tasks so no dependency conflicts arise.
+:arrows_counterclockwise: **Parallelism**: Flyte tasks are inherently parallel to optimize resource consumption and improve performance.
+:floppy_disk: **Allocate resources dynamically** at the task level.
+:play_or_pause_button: [Wait](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/advanced_composition/waiting_for_external_inputs.html) for **external inputs** before proceeding with the execution.
## Quick start @@ -90,7 +90,7 @@ If you want to try out Flyte, the easiest way to get started is by using the Fly However, if you prefer to install Flyte locally and run a workflow, our [getting started guide](https://docs.flyte.org/projects/cookbook/en/latest/index.html) is the best place to start. It provides step-by-step instructions on how to install Flyte locally and run your first workflow. -> If you're unsure what a Flyte [task](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/basics/task.html) and [workflow](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/basics/workflow.html) are, be sure to check out our guides on both! +> If you're unsure what either a [Flyte task](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/basics/task.html), or a [workflow](https://docs.flyte.org/projects/cookbook/en/latest/auto_examples/basics/workflow.html) is, be sure to check out our guides on both! ### Deploy Flyte to production @@ -103,13 +103,13 @@ Join the likes of LinkedIn, Spotify, Freenome, Pachama, Gojek, and Woven Planet ## Resources -πŸ“† [Weekly office hours](https://calendly.com/flyte-office-hours-01/30min): Live informal sessions with the Flyte team held every week. Book a 30-minute slot and get your questions answered.
-πŸ‘₯ [Biweekly community sync](https://www.addevent.com/event/EA7823958): A biweekly event where the Flyte team provides updates on the project and community members can share their progress and ask questions.
-πŸ’¬ [Slack](https://slack.flyte.org/): Join the Flyte community on Slack to chat with other users, ask questions, and get help.
-⚠️ [GitHub](https://github.com/flyteorg/flyte): Check out the Flyte project on GitHub to file issues, contribute code, and stay up to date on the latest development.
-πŸ“Ή [Youtube](https://www.youtube.com/channel/UCNduEoLOToNo3nFVly-vUTQ): Tune into panel discussions, customer success stories, community updates and feature deep dives.
-πŸ“„ [Blog](https://flyte.org/blog): Here, you can find tutorials and feature deep dives to help you learn more about Flyte.
-πŸ’‘ [RFCs](rfc/.): RFCs are used for proposing new ideas and features to improve Flyte. You can refer to them to stay updated on the latest developments and contribute to the growth of the platform. +:spiral_calendar: [Weekly office hours](https://calendly.com/flyte-office-hours-01/30min): Live informal sessions with the Flyte team held every week. Book a 30-minute slot and get your questions answered.
+:couple: [Biweekly community sync](https://www.addevent.com/event/EA7823958): A biweekly event where the Flyte team provides updates on the project and community members can share their progress and ask questions.
+:speech_balloon: [Slack](https://slack.flyte.org/): Join the Flyte community on Slack to chat with other users, ask questions, and get help.
+:warning: [GitHub](https://github.com/flyteorg/flyte): Check out the Flyte project on GitHub to file issues, contribute code, and stay up to date on the latest development.
+:video_camera: [Youtube](https://www.youtube.com/channel/UCNduEoLOToNo3nFVly-vUTQ): Tune into panel discussions, customer success stories, community updates and feature deep dives.
+:writing_hand: [Blog](https://flyte.org/blog): Here, you can find tutorials and feature deep dives to help you learn more about Flyte.
+:bulb: [RFCs](rfc/.): RFCs are used for proposing new ideas and features to improve Flyte. You can refer to them to stay updated on the latest developments and contribute to the growth of the platform. ## How to contribute @@ -121,7 +121,7 @@ There are many ways to get involved in Flyte, including: - Taking on a [`help wanted`](https://github.com/flyteorg/flyte/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22+) or [`good-first-issue`](https://github.com/flyteorg/flyte/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) and following the [CONTRIBUTING](https://docs.flyte.org/en/latest/community/contribute.html) guide to submit changes to the codebase. - Upvoting [popular feature requests](https://github.com/flyteorg/flyte/issues?q=is%3Aopen+is%3Aissue+label%3Aenhancement+sort%3Areactions-%2B1-desc) to show your support. -### We ❀️ our contributors +### We :heart: our contributors diff --git a/boilerplate/flyte/end2end/run-tests.py b/boilerplate/flyte/end2end/run-tests.py index 5365da006e..9f8a8e85cb 100644 --- a/boilerplate/flyte/end2end/run-tests.py +++ b/boilerplate/flyte/end2end/run-tests.py @@ -5,7 +5,7 @@ import sys import time import traceback -from typing import Dict, List, Mapping, Tuple +from typing import Dict, List, Mapping, Tuple, Optional import click import requests @@ -50,15 +50,17 @@ ("basics.named_outputs.simple_wf_with_named_outputs", {}), # # Getting a 403 for the wikipedia image # # ("basics.reference_task.wf", {}), - ("data_types_and_io.custom_objects.wf", {"x": 10, "y": 20}), + ("data_types_and_io.dataclass.dataclass_wf", {"x": 10, "y": 20}), # Enums are not supported in flyteremote # ("type_system.enums.enum_wf", {"c": "red"}), - ("data_types_and_io.schema.df_wf", {"a": 42}), - ("data_types_and_io.typed_schema.wf", {}), + ("data_types_and_io.structured_dataset.simple_sd_wf", {"a": 42}), # ("my.imperative.workflow.example", {"in1": "hello", "in2": "foo"}), ], "integrations-k8s-spark": [ - 
("k8s_spark_plugin.pyspark_pi.my_spark", {"triggered_date": datetime.datetime.now()}), + ( + "k8s_spark_plugin.pyspark_pi.my_spark", + {"triggered_date": datetime.datetime.now()}, + ), ], "integrations-kfpytorch": [ ("kfpytorch_plugin.pytorch_mnist.pytorch_training_wf", {}), @@ -89,20 +91,30 @@ } -def execute_workflow(remote, version, workflow_name, inputs): +def execute_workflow( + remote: FlyteRemote, + version, + workflow_name, + inputs, + cluster_pool_name: Optional[str] = None, +): print(f"Fetching workflow={workflow_name} and version={version}") wf = remote.fetch_workflow(name=workflow_name, version=version) - return remote.execute(wf, inputs=inputs, wait=False) + return remote.execute(wf, inputs=inputs, wait=False, cluster_pool=cluster_pool_name) -def executions_finished(executions_by_wfgroup: Dict[str, List[FlyteWorkflowExecution]]) -> bool: +def executions_finished( + executions_by_wfgroup: Dict[str, List[FlyteWorkflowExecution]] +) -> bool: for executions in executions_by_wfgroup.values(): if not all([execution.is_done for execution in executions]): return False return True -def sync_executions(remote: FlyteRemote, executions_by_wfgroup: Dict[str, List[FlyteWorkflowExecution]]): +def sync_executions( + remote: FlyteRemote, executions_by_wfgroup: Dict[str, List[FlyteWorkflowExecution]] +): try: for executions in executions_by_wfgroup.values(): for execution in executions: @@ -125,6 +137,7 @@ def schedule_workflow_groups( workflow_groups: List[str], remote: FlyteRemote, terminate_workflow_on_failure: bool, + cluster_pool_name: Optional[str] = None, ) -> Dict[str, bool]: """ Schedule workflows executions for all workflow groups and return True if all executions succeed, otherwise @@ -135,14 +148,19 @@ def schedule_workflow_groups( for wf_group in workflow_groups: workflows = FLYTESNACKS_WORKFLOW_GROUPS.get(wf_group, []) executions_by_wfgroup[wf_group] = [ - execute_workflow(remote, tag, workflow[0], workflow[1]) for workflow in workflows + 
execute_workflow(remote, tag, workflow[0], workflow[1], cluster_pool_name) + for workflow in workflows ] # Wait for all executions to finish attempt = 0 - while attempt == 0 or (not executions_finished(executions_by_wfgroup) and attempt < MAX_ATTEMPTS): + while attempt == 0 or ( + not executions_finished(executions_by_wfgroup) and attempt < MAX_ATTEMPTS + ): attempt += 1 - print(f"Not all executions finished yet. Sleeping for some time, will check again in {WAIT_TIME}s") + print( + f"Not all executions finished yet. Sleeping for some time, will check again in {WAIT_TIME}s" + ) time.sleep(WAIT_TIME) sync_executions(remote, executions_by_wfgroup) @@ -158,9 +176,13 @@ def schedule_workflow_groups( if len(non_succeeded_executions) != 0: print(f"Failed executions for {wf_group}:") for execution in non_succeeded_executions: - print(f" workflow={execution.spec.launch_plan.name}, execution_id={execution.id.name}") + print( + f" workflow={execution.spec.launch_plan.name}, execution_id={execution.id.name}" + ) if terminate_workflow_on_failure: - remote.terminate(execution, "aborting execution scheduled in functional test") + remote.terminate( + execution, "aborting execution scheduled in functional test" + ) # A workflow group succeeds iff all of its executions succeed results[wf_group] = len(non_succeeded_executions) == 0 return results @@ -179,17 +201,21 @@ def run( priorities: List[str], config_file_path, terminate_workflow_on_failure: bool, + test_project_name: str, + test_project_domain: str, + cluster_pool_name: Optional[str] = None, ) -> List[Dict[str, str]]: remote = FlyteRemote( Config.auto(config_file=config_file_path), - default_project="flytesnacks", - default_domain="development", + test_project_name, + test_project_domain, ) # For a given release tag and priority, this function filters the workflow groups from the flytesnacks # manifest file. For example, for the release tag "v0.2.224" and the priority "P0" it returns [ "core" ]. 
manifest_url = ( - "https://raw.githubusercontent.com/flyteorg/flytesnacks/" f"{flytesnacks_release_tag}/flyte_tests_manifest.json" + "https://raw.githubusercontent.com/flyteorg/flytesnacks/" + f"{flytesnacks_release_tag}/flyte_tests_manifest.json" ) r = requests.get(manifest_url) parsed_manifest = r.json() @@ -197,7 +223,11 @@ def run( workflow_groups = ( ["lite"] if "lite" in priorities - else [group["name"] for group in parsed_manifest if group["priority"] in priorities] + else [ + group["name"] + for group in parsed_manifest + if group["priority"] in priorities + ] ) results = [] @@ -215,7 +245,11 @@ def run( valid_workgroups.append(workflow_group) results_by_wfgroup = schedule_workflow_groups( - flytesnacks_release_tag, valid_workgroups, remote, terminate_workflow_on_failure + flytesnacks_release_tag, + valid_workgroups, + remote, + terminate_workflow_on_failure, + cluster_pool_name, ) for workflow_group, succeeded in results_by_wfgroup.items(): @@ -246,6 +280,9 @@ def run( @click.command() +@click.argument("flytesnacks_release_tag") +@click.argument("priorities") +@click.argument("config_file") @click.option( "--return_non_zero_on_failure", default=False, @@ -258,18 +295,46 @@ def run( is_flag=True, help="Abort failing workflows upon exit", ) -@click.argument("flytesnacks_release_tag") -@click.argument("priorities") -@click.argument("config_file") +@click.option( + "--test_project_name", + default="flytesnacks", + type=str, + is_flag=False, + help="Name of project to run functional tests on", +) +@click.option( + "--test_project_domain", + default="development", + type=str, + is_flag=False, + help="Name of domain in project to run functional tests on", +) +@click.argument( + "cluster_pool_name", + required=False, + type=str, + default=None, +) def cli( flytesnacks_release_tag, priorities, config_file, return_non_zero_on_failure, terminate_workflow_on_failure, + test_project_name, + test_project_domain, + cluster_pool_name, ): 
print(f"return_non_zero_on_failure={return_non_zero_on_failure}") - results = run(flytesnacks_release_tag, priorities, config_file, terminate_workflow_on_failure) + results = run( + flytesnacks_release_tag, + priorities, + config_file, + terminate_workflow_on_failure, + test_project_name, + test_project_domain, + cluster_pool_name, + ) # Write a json object in its own line describing the result of this run to stdout print(f"Result of run:\n{json.dumps(results)}") diff --git a/boilerplate/flyte/golang_test_targets/Makefile b/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..d3ab9817f7 100644 --- a/boilerplate/flyte/golang_test_targets/Makefile +++ b/boilerplate/flyte/golang_test_targets/Makefile @@ -3,6 +3,9 @@ # # TO OPT OUT OF UPDATES, SEE https://github.com/flyteorg/boilerplate/blob/master/Readme.rst +.PHONY: codespell +codespell: + git ls-files | grep -vE 'go\.mod|go\.sum|flyteidl/|\.pb$$|\.git|\.pdf|\.svg|requirements\.txt|gen/' | xargs codespell -w .PHONY: download_tooling download_tooling: #download dependencies (including test deps) for the package @@ -13,8 +16,8 @@ generate: download_tooling #generate go code @boilerplate/flyte/golang_test_targets/go-gen.sh .PHONY: lint -lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v +lint: codespell download_tooling #lints the package for common code smells + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. # skips 'vendor' diff --git a/charts/flyte-binary/eks-starter.yaml b/charts/flyte-binary/eks-starter.yaml index 1ffe5146c6..c7ca135340 100644 --- a/charts/flyte-binary/eks-starter.yaml +++ b/charts/flyte-binary/eks-starter.yaml @@ -1,8 +1,9 @@ configuration: database: + username: password: host: - dbname: app # what is this? why not flyteadmin or flyte? 
+ dbname: flyteadmin () storage: metadataContainer: userDataContainer: diff --git a/charts/flyte-binary/templates/deployment.yaml b/charts/flyte-binary/templates/deployment.yaml index cc8beff2d9..d1be13d166 100644 --- a/charts/flyte-binary/templates/deployment.yaml +++ b/charts/flyte-binary/templates/deployment.yaml @@ -193,6 +193,7 @@ spec: httpGet: path: /healthcheck port: http + initialDelaySeconds: 30 {{- end }} readinessProbe: {{- if .Values.deployment.readinessProbe }} @@ -201,6 +202,7 @@ spec: httpGet: path: /healthcheck port: http + initialDelaySeconds: 30 {{- end }} {{- if .Values.deployment.resources }} resources: {{- toYaml .Values.deployment.resources | nindent 12 }} diff --git a/charts/flyte-core/templates/admin/configmap.yaml b/charts/flyte-core/templates/admin/configmap.yaml index 53af659e8b..04e5cac6b3 100644 --- a/charts/flyte-core/templates/admin/configmap.yaml +++ b/charts/flyte-core/templates/admin/configmap.yaml @@ -48,6 +48,11 @@ data: notifications.yaml: | notifications: type: {{ .Values.workflow_notifications.config.notifications.type }} + {{- if not .Values.workflow_notifications.config.notifications.aws }} + {{- with .Values.workflow_notifications.config.notifications.region }} + region: {{ tpl . $ }} + {{- end }} + {{- end }} {{- if eq .Values.workflow_notifications.config.notifications.type "aws" }} {{- with .Values.workflow_notifications.config.notifications.aws }} aws: {{ tpl (toYaml .) $ | nindent 8 }} diff --git a/charts/flyte-core/templates/common/ingress.yaml b/charts/flyte-core/templates/common/ingress.yaml index 19fd66563e..2a45152a0b 100644 --- a/charts/flyte-core/templates/common/ingress.yaml +++ b/charts/flyte-core/templates/common/ingress.yaml @@ -1,4 +1,8 @@ {{- define "grpcRoutes" -}} +{{- $grpcPort := 81 -}} +{{- if eq .Values.configmap.adminServer.server.security.secure true -}} + {{- $grpcPort = 80 -}} +{{- end }} # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. 
- path: /flyteidl.service.SignalService pathType: ImplementationSpecific @@ -7,10 +11,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.SignalService/* pathType: ImplementationSpecific @@ -19,10 +23,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.AdminService pathType: ImplementationSpecific @@ -31,10 +35,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.AdminService/* pathType: ImplementationSpecific @@ -43,10 +47,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.DataProxyService pathType: ImplementationSpecific @@ -55,10 +59,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.DataProxyService/* pathType: ImplementationSpecific @@ -67,10 +71,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.AuthMetadataService pathType: ImplementationSpecific @@ -79,10 +83,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.AuthMetadataService/* pathType: ImplementationSpecific @@ -91,10 +95,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ 
$grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.IdentityService pathType: ImplementationSpecific @@ -103,10 +107,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /flyteidl.service.IdentityService/* pathType: ImplementationSpecific @@ -115,10 +119,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /grpc.health.v1.Health pathType: ImplementationSpecific @@ -127,10 +131,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} - path: /grpc.health.v1.Health/* pathType: ImplementationSpecific @@ -139,10 +143,10 @@ service: name: flyteadmin port: - number: 81 + number: {{ $grpcPort }} {{- else }} serviceName: flyteadmin - servicePort: 81 + servicePort: {{ $grpcPort }} {{- end }} {{- end }} {{- if .Values.common.ingress.enabled }} diff --git a/charts/flyte-core/templates/datacatalog/service.yaml b/charts/flyte-core/templates/datacatalog/service.yaml index 7ea5c84412..494d7c2b38 100644 --- a/charts/flyte-core/templates/datacatalog/service.yaml +++ b/charts/flyte-core/templates/datacatalog/service.yaml @@ -13,10 +13,6 @@ spec: type: {{ . 
}} {{- end }} ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP diff --git a/charts/flyte-sandbox/templates/proxy/deployment.yaml b/charts/flyte-sandbox/templates/proxy/deployment.yaml index 9866466f20..74c6db10e8 100644 --- a/charts/flyte-sandbox/templates/proxy/deployment.yaml +++ b/charts/flyte-sandbox/templates/proxy/deployment.yaml @@ -25,6 +25,13 @@ spec: volumeMounts: - name: config mountPath: /etc/envoy + livenessProbe: + tcpSocket: + port: http + initialDelaySeconds: 30 + readinessProbe: + tcpSocket: + port: http volumes: - name: config configMap: diff --git a/datacatalog/boilerplate/flyte/golang_test_targets/Makefile b/datacatalog/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..be72275f5a 100644 --- a/datacatalog/boilerplate/flyte/golang_test_targets/Makefile +++ b/datacatalog/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. # skips 'vendor' diff --git a/deployment/eks/flyte_aws_scheduler_helm_generated.yaml b/deployment/eks/flyte_aws_scheduler_helm_generated.yaml index a05c6869df..00af22b862 100644 --- a/deployment/eks/flyte_aws_scheduler_helm_generated.yaml +++ b/deployment/eks/flyte_aws_scheduler_helm_generated.yaml @@ -797,10 +797,6 @@ metadata: spec: type: NodePort ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP @@ -1574,6 +1570,7 @@ spec: # path: /* # pathType: ImplementationSpecific # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. 
- path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/eks/flyte_helm_controlplane_generated.yaml b/deployment/eks/flyte_helm_controlplane_generated.yaml index 323176fea1..a4ea548579 100644 --- a/deployment/eks/flyte_helm_controlplane_generated.yaml +++ b/deployment/eks/flyte_helm_controlplane_generated.yaml @@ -521,10 +521,6 @@ metadata: spec: type: NodePort ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP @@ -1202,6 +1198,7 @@ spec: # path: /* # pathType: ImplementationSpecific # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. - path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/eks/flyte_helm_dataplane_generated.yaml b/deployment/eks/flyte_helm_dataplane_generated.yaml index 4fc4767953..6746cc72fd 100644 --- a/deployment/eks/flyte_helm_dataplane_generated.yaml +++ b/deployment/eks/flyte_helm_dataplane_generated.yaml @@ -771,6 +771,7 @@ spec: # path: /* # pathType: ImplementationSpecific # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. - path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/eks/flyte_helm_generated.yaml b/deployment/eks/flyte_helm_generated.yaml index 1139ce2fbc..32399dd395 100644 --- a/deployment/eks/flyte_helm_generated.yaml +++ b/deployment/eks/flyte_helm_generated.yaml @@ -828,10 +828,6 @@ metadata: spec: type: NodePort ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP @@ -1693,6 +1689,7 @@ spec: # path: /* # pathType: ImplementationSpecific # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. 
- path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/gcp/flyte_helm_controlplane_generated.yaml b/deployment/gcp/flyte_helm_controlplane_generated.yaml index 27d1319bf4..fc63915736 100644 --- a/deployment/gcp/flyte_helm_controlplane_generated.yaml +++ b/deployment/gcp/flyte_helm_controlplane_generated.yaml @@ -536,10 +536,6 @@ metadata: spec: type: NodePort ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP @@ -1196,6 +1192,7 @@ spec: http: paths: # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. - path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/gcp/flyte_helm_dataplane_generated.yaml b/deployment/gcp/flyte_helm_dataplane_generated.yaml index 458d10a150..6b5220ffcd 100644 --- a/deployment/gcp/flyte_helm_dataplane_generated.yaml +++ b/deployment/gcp/flyte_helm_dataplane_generated.yaml @@ -757,6 +757,7 @@ spec: http: paths: # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. - path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/gcp/flyte_helm_generated.yaml b/deployment/gcp/flyte_helm_generated.yaml index ac06058686..8b0378d749 100644 --- a/deployment/gcp/flyte_helm_generated.yaml +++ b/deployment/gcp/flyte_helm_generated.yaml @@ -851,10 +851,6 @@ metadata: spec: type: NodePort ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP @@ -1694,6 +1690,7 @@ spec: http: paths: # + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. 
- path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/deployment/sandbox-binary/flyte_sandbox_binary_helm_generated.yaml b/deployment/sandbox-binary/flyte_sandbox_binary_helm_generated.yaml index a72612995d..6be6f29c35 100644 --- a/deployment/sandbox-binary/flyte_sandbox_binary_helm_generated.yaml +++ b/deployment/sandbox-binary/flyte_sandbox_binary_helm_generated.yaml @@ -417,10 +417,12 @@ spec: httpGet: path: /healthcheck port: http + initialDelaySeconds: 30 readinessProbe: httpGet: path: /healthcheck port: http + initialDelaySeconds: 30 volumeMounts: - name: cluster-resource-templates mountPath: /etc/flyte/cluster-resource-templates diff --git a/deployment/sandbox/flyte_helm_generated.yaml b/deployment/sandbox/flyte_helm_generated.yaml index bd5a1ebf83..7086a756ef 100644 --- a/deployment/sandbox/flyte_helm_generated.yaml +++ b/deployment/sandbox/flyte_helm_generated.yaml @@ -6196,10 +6196,6 @@ metadata: spec: type: NodePort ports: - - name: grpc-2 - port: 8089 - protocol: TCP - targetPort: 8089 - name: http port: 88 protocol: TCP @@ -7705,6 +7701,7 @@ spec: name: flyteadmin port: number: 80 + # NOTE: Port 81 in flyteadmin is the GRPC server port for FlyteAdmin. 
- path: /flyteidl.service.SignalService pathType: ImplementationSpecific diff --git a/doc-requirements.in b/doc-requirements.in index 791b7a9ae7..49f7439659 100644 --- a/doc-requirements.in +++ b/doc-requirements.in @@ -1,3 +1,4 @@ +codespell git+https://github.com/flyteorg/furo@main sphinx sphinx-prompt diff --git a/doc-requirements.txt b/doc-requirements.txt index 557c1958f6..289144e3ad 100644 --- a/doc-requirements.txt +++ b/doc-requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # # pip-compile doc-requirements.in @@ -20,6 +20,8 @@ cfgv==3.4.0 # via pre-commit charset-normalizer==3.2.0 # via requests +codespell==2.2.6 + # via -r doc-requirements.in distlib==0.3.7 # via virtualenv docutils==0.17.1 @@ -37,6 +39,8 @@ idna==3.4 # via requests imagesize==1.4.1 # via sphinx +importlib-metadata==6.8.0 + # via sphinx jinja2==3.0.3 # via # sphinx @@ -92,7 +96,6 @@ sphinx==4.5.0 # sphinx-tags # sphinxcontrib-video # sphinxcontrib-youtube - # sphinxext-remoteliteralinclude sphinx-autoapi==2.0.1 # via -r doc-requirements.in sphinx-basic-ng==1.0.0b2 @@ -133,6 +136,8 @@ sphinxcontrib-youtube==1.2.0 # via -r doc-requirements.in sphinxext-remoteliteralinclude==0.4.0 # via -r doc-requirements.in +typing-extensions==4.8.0 + # via astroid unidecode==1.3.6 # via sphinx-autoapi urllib3==2.0.6 @@ -141,6 +146,8 @@ virtualenv==20.24.5 # via pre-commit wrapt==1.15.0 # via astroid +zipp==3.17.0 + # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/docker/sandbox-bundled/manifests/complete-agent.yaml b/docker/sandbox-bundled/manifests/complete-agent.yaml index d9d94e5023..2eadbecf80 100644 --- a/docker/sandbox-bundled/manifests/complete-agent.yaml +++ b/docker/sandbox-bundled/manifests/complete-agent.yaml @@ -816,7 +816,7 @@ type: Opaque --- apiVersion: v1 data: - haSharedSecret: 
SWRqSVZmendLdVJvamVGRQ== + haSharedSecret: UzFydXlGcGwyM3FJYWpTdw== proxyPassword: "" proxyUsername: "" kind: Secret @@ -1273,6 +1273,7 @@ spec: httpGet: path: /healthcheck port: http + initialDelaySeconds: 30 name: flyte ports: - containerPort: 8088 @@ -1409,7 +1410,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 4e6248034c039b0bc65cf110d26fccc9d88b8a2d49855a0ecba0443f972ca3c3 + checksum/secret: 12acfc99559b7ddf909a9c66f3645153d66e96bc8dd1a7c1e2ff5af865d51827 labels: app: docker-registry release: flyte-sandbox @@ -1697,10 +1698,17 @@ spec: containers: - image: envoyproxy/envoy:sandbox imagePullPolicy: Never + livenessProbe: + initialDelaySeconds: 30 + tcpSocket: + port: http name: proxy ports: - containerPort: 8000 name: http + readinessProbe: + tcpSocket: + port: http volumeMounts: - mountPath: /etc/envoy name: config @@ -1744,7 +1752,7 @@ spec: value: minio - name: FLYTE_AWS_SECRET_ACCESS_KEY value: miniostorage - image: ghcr.io/flyteorg/flyteagent:1.9.1 + image: ghcr.io/flyteorg/flyteagent:1.10.0 imagePullPolicy: IfNotPresent name: flyteagent ports: diff --git a/docker/sandbox-bundled/manifests/complete.yaml b/docker/sandbox-bundled/manifests/complete.yaml index 998f71d0a0..06f86532c6 100644 --- a/docker/sandbox-bundled/manifests/complete.yaml +++ b/docker/sandbox-bundled/manifests/complete.yaml @@ -805,7 +805,7 @@ type: Opaque --- apiVersion: v1 data: - haSharedSecret: UUREcXo3a1VGNnlyc1RCWg== + haSharedSecret: Y2dKT2xJeEhXaWUyZUcwMg== proxyPassword: "" proxyUsername: "" kind: Secret @@ -1230,6 +1230,7 @@ spec: httpGet: path: /healthcheck port: http + initialDelaySeconds: 30 name: flyte ports: - containerPort: 8088 @@ -1366,7 +1367,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: cfe2089fc583f69d068c3b3d56e875082a5d926c70b00b32f094d587df7396a5 + checksum/secret: 
9a48d62cb53f25eaefca010e49af275334df350c757deb72a31fca6dcb2cdb95 labels: app: docker-registry release: flyte-sandbox @@ -1654,10 +1655,17 @@ spec: containers: - image: envoyproxy/envoy:sandbox imagePullPolicy: Never + livenessProbe: + initialDelaySeconds: 30 + tcpSocket: + port: http name: proxy ports: - containerPort: 8000 name: http + readinessProbe: + tcpSocket: + port: http volumeMounts: - mountPath: /etc/envoy name: config diff --git a/docker/sandbox-bundled/manifests/dev.yaml b/docker/sandbox-bundled/manifests/dev.yaml index f88f94fcd3..eda6c8c4ca 100644 --- a/docker/sandbox-bundled/manifests/dev.yaml +++ b/docker/sandbox-bundled/manifests/dev.yaml @@ -499,7 +499,7 @@ metadata: --- apiVersion: v1 data: - haSharedSecret: VjRtTTRpSW94OU9rUVFZTQ== + haSharedSecret: OFd1Q2o4V28zVk1lV0lOcA== proxyPassword: "" proxyUsername: "" kind: Secret @@ -933,7 +933,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 8b25114d47a69e875e3841dace0b87abfda7a16f6833a5cf76b4570092f3a1cd + checksum/secret: 83de0933db4333daa08a690fbbf752532361f657be692a243b79daa7dc1fef1c labels: app: docker-registry release: flyte-sandbox @@ -1221,10 +1221,17 @@ spec: containers: - image: envoyproxy/envoy:sandbox imagePullPolicy: Never + livenessProbe: + initialDelaySeconds: 30 + tcpSocket: + port: http name: proxy ports: - containerPort: 8000 name: http + readinessProbe: + tcpSocket: + port: http volumeMounts: - mountPath: /etc/envoy name: config diff --git a/flyteadmin/auth/token.go b/flyteadmin/auth/token.go index 9b90a2d108..97029c870a 100644 --- a/flyteadmin/auth/token.go +++ b/flyteadmin/auth/token.go @@ -131,7 +131,12 @@ func IdentityContextFromIDTokenToken(ctx context.Context, tokenStr, clientID str logger.Infof(ctx, "Failed to unmarshal claims from id token, err: %v", err) } - // TODO: Document why automatically specify "all" scope + // This path is used when a user logs into the UI and when you login through 
the UI, you should have all the capabilities your identity + // allows you to have, which is denoted by the "all" scope. + // There was a plan to one day define one of a handful of scopes (all, proj admin, user, viewer) and if you configure your IDP + // to issue the right scopes, admin can do very light weight 'AuthZ' on admin based on these scopes, but until that plan is effected, + // we just use this single scope that Admin expects for all methods + // And because not all IdPs allow us to configure the Identity Token claims, the scope needs to live here. return NewIdentityContext(idToken.Audience[0], idToken.Subject, "", idToken.IssuedAt, sets.NewString(ScopeAll), userInfo, claims) } diff --git a/flyteadmin/boilerplate/flyte/golang_test_targets/Makefile b/flyteadmin/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..be72275f5a 100644 --- a/flyteadmin/boilerplate/flyte/golang_test_targets/Makefile +++ b/flyteadmin/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. 
# skips 'vendor' diff --git a/flyteadmin/pkg/async/notifications/factory.go b/flyteadmin/pkg/async/notifications/factory.go index 53e96f7e67..f94129a1d5 100644 --- a/flyteadmin/pkg/async/notifications/factory.go +++ b/flyteadmin/pkg/async/notifications/factory.go @@ -64,7 +64,11 @@ func GetEmailer(config runtimeInterfaces.NotificationsConfig, scope promutils.Sc switch config.Type { case common.AWS: - awsConfig := aws.NewConfig().WithRegion(config.Region).WithMaxRetries(maxRetries) + region := config.AWSConfig.Region + if region == "" { + region = config.Region + } + awsConfig := aws.NewConfig().WithRegion(region).WithMaxRetries(maxRetries) awsSession, err := session.NewSession(awsConfig) if err != nil { panic(err) @@ -98,7 +102,11 @@ func NewNotificationsProcessor(config runtimeInterfaces.NotificationsConfig, sco // However, the message body of SQS is the SNS message format which isn't Base64 encoded. ConsumeBase64: &enable64decoding, } - sqsConfig.Region = config.Region + if config.AWSConfig.Region != "" { + sqsConfig.Region = config.AWSConfig.Region + } else { + sqsConfig.Region = config.Region + } var err error err = async.Retry(reconnectAttempts, reconnectDelay, func() error { sub, err = gizmoAWS.NewSubscriber(sqsConfig) diff --git a/flyteadmin/pkg/errors/errors.go b/flyteadmin/pkg/errors/errors.go index 42ef63841b..e269715a91 100644 --- a/flyteadmin/pkg/errors/errors.go +++ b/flyteadmin/pkg/errors/errors.go @@ -140,3 +140,8 @@ func NewWorkflowExistsIdenticalStructureError(ctx context.Context, request *admi } return statusErr } + +func IsDoesNotExistError(err error) bool { + adminError, ok := err.(FlyteAdminError) + return ok && adminError.Code() == codes.NotFound +} diff --git a/flyteadmin/pkg/errors/errors_test.go b/flyteadmin/pkg/errors/errors_test.go index 4b6b250166..cb6a2a0ae0 100644 --- a/flyteadmin/pkg/errors/errors_test.go +++ b/flyteadmin/pkg/errors/errors_test.go @@ -2,6 +2,7 @@ package errors import ( "context" + "errors" "testing" 
"github.com/stretchr/testify/assert" @@ -90,3 +91,15 @@ func TestNewWorkflowExistsIdenticalStructureError(t *testing.T) { _, ok = details.GetReason().(*admin.CreateWorkflowFailureReason_ExistsIdenticalStructure) assert.True(t, ok) } + +func TestIsDoesNotExistError(t *testing.T) { + assert.True(t, IsDoesNotExistError(NewFlyteAdminError(codes.NotFound, "foo"))) +} + +func TestIsNotDoesNotExistError(t *testing.T) { + assert.False(t, IsDoesNotExistError(NewFlyteAdminError(codes.Canceled, "foo"))) +} + +func TestIsNotDoesNotExistErrorBecauseOfNoneAdminError(t *testing.T) { + assert.False(t, IsDoesNotExistError(errors.New("foo"))) +} diff --git a/flyteadmin/pkg/executioncluster/impl/random_cluster_selector.go b/flyteadmin/pkg/executioncluster/impl/random_cluster_selector.go index 712470a1ee..74dad42ce0 100644 --- a/flyteadmin/pkg/executioncluster/impl/random_cluster_selector.go +++ b/flyteadmin/pkg/executioncluster/impl/random_cluster_selector.go @@ -6,8 +6,6 @@ import ( "hash/fnv" "math/rand" - "google.golang.org/grpc/codes" - "github.com/flyteorg/flyte/flyteadmin/pkg/errors" "github.com/flyteorg/flyte/flyteadmin/pkg/executioncluster" "github.com/flyteorg/flyte/flyteadmin/pkg/executioncluster/interfaces" @@ -102,10 +100,8 @@ func (s RandomClusterSelector) GetTarget(ctx context.Context, spec *executionclu LaunchPlan: spec.LaunchPlan, ResourceType: admin.MatchableResource_EXECUTION_CLUSTER_LABEL, }) - if err != nil { - if flyteAdminError, ok := err.(errors.FlyteAdminError); !ok || flyteAdminError.Code() != codes.NotFound { - return nil, err - } + if err != nil && !errors.IsDoesNotExistError(err) { + return nil, err } var weightedRandomList random.WeightedRandomList diff --git a/flyteadmin/pkg/manager/impl/execution_manager.go b/flyteadmin/pkg/manager/impl/execution_manager.go index e31f501686..06516657d2 100644 --- a/flyteadmin/pkg/manager/impl/execution_manager.go +++ b/flyteadmin/pkg/manager/impl/execution_manager.go @@ -164,11 +164,8 @@ func (m *ExecutionManager) 
addPluginOverrides(ctx context.Context, executionID * LaunchPlan: launchPlanName, ResourceType: admin.MatchableResource_PLUGIN_OVERRIDE, }) - if err != nil { - ec, ok := err.(errors.FlyteAdminError) - if !ok || ec.Code() != codes.NotFound { - return nil, err - } + if err != nil && !errors.IsDoesNotExistError(err) { + return nil, err } if override != nil && override.Attributes != nil && override.Attributes.GetPluginOverrides() != nil { return override.Attributes.GetPluginOverrides().Overrides, nil @@ -427,11 +424,9 @@ func (m *ExecutionManager) getClusterAssignment(ctx context.Context, request *ad Domain: request.Domain, ResourceType: admin.MatchableResource_CLUSTER_ASSIGNMENT, }) - if err != nil { - if flyteAdminError, ok := err.(errors.FlyteAdminError); !ok || flyteAdminError.Code() != codes.NotFound { - logger.Errorf(ctx, "Failed to get cluster assignment overrides with error: %v", err) - return nil, err - } + if err != nil && !errors.IsDoesNotExistError(err) { + logger.Errorf(ctx, "Failed to get cluster assignment overrides with error: %v", err) + return nil, err } if resource != nil && resource.Attributes.GetClusterAssignment() != nil { return resource.Attributes.GetClusterAssignment(), nil diff --git a/flyteadmin/pkg/manager/impl/executions/quality_of_service.go b/flyteadmin/pkg/manager/impl/executions/quality_of_service.go index 345b0e926d..a96d99d3d6 100644 --- a/flyteadmin/pkg/manager/impl/executions/quality_of_service.go +++ b/flyteadmin/pkg/manager/impl/executions/quality_of_service.go @@ -43,7 +43,7 @@ func (q qualityOfServiceAllocator) getQualityOfServiceFromDb(ctx context.Context ResourceType: admin.MatchableResource_QUALITY_OF_SERVICE_SPECIFICATION, }) if err != nil { - if _, ok := err.(errors.FlyteAdminError); !ok || err.(errors.FlyteAdminError).Code() != codes.NotFound { + if !errors.IsDoesNotExistError(err) { logger.Warningf(ctx, "Failed to fetch override values when assigning quality of service values for [%+v] with err: %v", workflowIdentifier, 
err) diff --git a/flyteadmin/pkg/manager/impl/executions/queues.go b/flyteadmin/pkg/manager/impl/executions/queues.go index 9ec4ff865a..5e4706700c 100644 --- a/flyteadmin/pkg/manager/impl/executions/queues.go +++ b/flyteadmin/pkg/manager/impl/executions/queues.go @@ -4,6 +4,7 @@ import ( "context" "math/rand" + "github.com/flyteorg/flyte/flyteadmin/pkg/errors" "github.com/flyteorg/flyte/flyteadmin/pkg/manager/impl/resources" "github.com/flyteorg/flyte/flyteadmin/pkg/manager/interfaces" repoInterfaces "github.com/flyteorg/flyte/flyteadmin/pkg/repositories/interfaces" @@ -64,7 +65,7 @@ func (q *queueAllocatorImpl) GetQueue(ctx context.Context, identifier core.Ident ResourceType: admin.MatchableResource_EXECUTION_QUEUE, }) - if err != nil { + if err != nil && !errors.IsDoesNotExistError(err) { logger.Warningf(ctx, "Failed to fetch override values when assigning execution queue for [%+v] with err: %v", identifier, err) } diff --git a/flyteadmin/pkg/manager/impl/util/resources.go b/flyteadmin/pkg/manager/impl/util/resources.go index 0180484005..8001b907dd 100644 --- a/flyteadmin/pkg/manager/impl/util/resources.go +++ b/flyteadmin/pkg/manager/impl/util/resources.go @@ -6,6 +6,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" + "github.com/flyteorg/flyte/flyteadmin/pkg/errors" "github.com/flyteorg/flyte/flyteadmin/pkg/manager/interfaces" runtimeInterfaces "github.com/flyteorg/flyte/flyteadmin/pkg/runtime/interfaces" workflowengineInterfaces "github.com/flyteorg/flyte/flyteadmin/pkg/workflowengine/interfaces" @@ -100,7 +101,7 @@ func GetTaskResources(ctx context.Context, id *core.Identifier, resourceManager } resource, err := resourceManager.GetResource(ctx, request) - if err != nil { + if err != nil && !errors.IsDoesNotExistError(err) { logger.Infof(ctx, "Failed to fetch override values when assigning task resource default values for [%+v]: %v", id, err) } diff --git a/flyteadmin/pkg/manager/impl/util/shared.go b/flyteadmin/pkg/manager/impl/util/shared.go index 
cf7febd619..24c97f416a 100644 --- a/flyteadmin/pkg/manager/impl/util/shared.go +++ b/flyteadmin/pkg/manager/impl/util/shared.go @@ -275,12 +275,10 @@ func GetMatchableResource(ctx context.Context, resourceManager interfaces.Resour Workflow: workflowName, ResourceType: resourceType, }) - if err != nil { - if flyteAdminError, ok := err.(errors.FlyteAdminError); !ok || flyteAdminError.Code() != codes.NotFound { - logger.Errorf(ctx, "Failed to get %v overrides in %s project %s domain %s workflow with error: %v", resourceType, - project, domain, workflowName, err) - return nil, err - } + if err != nil && !errors.IsDoesNotExistError(err) { + logger.Errorf(ctx, "Failed to get %v overrides in %s project %s domain %s workflow with error: %v", resourceType, + project, domain, workflowName, err) + return nil, err } return matchableResource, nil } diff --git a/flytecopilot/boilerplate/flyte/golang_test_targets/Makefile b/flytecopilot/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..be72275f5a 100644 --- a/flytecopilot/boilerplate/flyte/golang_test_targets/Makefile +++ b/flytecopilot/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. 
# skips 'vendor' diff --git a/flyteidl/boilerplate/flyte/golang_test_targets/Makefile b/flyteidl/boilerplate/flyte/golang_test_targets/Makefile index dcd95ef842..71764d8cbf 100644 --- a/flyteidl/boilerplate/flyte/golang_test_targets/Makefile +++ b/flyteidl/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. # skips 'vendor' diff --git a/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator.go b/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator.go index cf5a85671e..5d194851f2 100644 --- a/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator.go +++ b/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator.go @@ -2,7 +2,6 @@ package pkce import ( "context" - "errors" "fmt" "net/http" "net/url" @@ -68,7 +67,8 @@ func (f TokenOrchestrator) FetchTokenFromAuthFlow(ctx context.Context) (*oauth2. 
// Pass along http client used in oauth2 httpClient, ok := ctx.Value(oauth2.HTTPClient).(*http.Client) if !ok { - return nil, errors.New("Unable to retrieve httpClient used in oauth2 from context") + logger.Debugf(ctx, "using default http client instead") + httpClient = http.DefaultClient } serveMux.HandleFunc(redirectURL.Path, getAuthServerCallbackHandler(f.ClientConfig, pkceCodeVerifier, diff --git a/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator_test.go b/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator_test.go index 4799e3fabd..dc1c80f63a 100644 --- a/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator_test.go +++ b/flyteidl/clients/go/admin/pkce/auth_flow_orchestrator_test.go @@ -2,6 +2,7 @@ package pkce import ( "context" + "os/exec" "testing" "github.com/stretchr/testify/assert" @@ -29,5 +30,10 @@ func TestFetchFromAuthFlow(t *testing.T) { refreshedToken, err := orchestrator.FetchTokenFromAuthFlow(ctx) assert.Nil(t, refreshedToken) assert.NotNil(t, err) + // Throws an exitError since browser cannot be opened. + // But this makes sure that at least code is tested till the browser action is invoked. + // Earlier change had broken this by introducing a hard dependency on the http Client from the context + _, isAnExitError := err.(*exec.ExitError) + assert.True(t, isAnExitError) }) } diff --git a/flyteplugins/boilerplate/flyte/golang_test_targets/Makefile b/flyteplugins/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..be72275f5a 100644 --- a/flyteplugins/boilerplate/flyte/golang_test_targets/Makefile +++ b/flyteplugins/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. 
# skips 'vendor' diff --git a/flyteplugins/go/tasks/logs/config.go b/flyteplugins/go/tasks/logs/config.go index 69ef17ed89..ca5a6012a8 100644 --- a/flyteplugins/go/tasks/logs/config.go +++ b/flyteplugins/go/tasks/logs/config.go @@ -1,45 +1,34 @@ package logs import ( - "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" "github.com/flyteorg/flyte/flyteplugins/go/tasks/config" "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/tasklog" ) //go:generate pflags LogConfig --default-var=DefaultConfig -// TemplateURI is a URI that accepts templates. See: go/tasks/pluginmachinery/tasklog/template.go for available templates. -type TemplateURI = string - // LogConfig encapsulates plugins' log configs type LogConfig struct { IsCloudwatchEnabled bool `json:"cloudwatch-enabled" pflag:",Enable Cloudwatch Logging"` // Deprecated: Please use CloudwatchTemplateURI CloudwatchRegion string `json:"cloudwatch-region" pflag:",AWS region in which Cloudwatch logs are stored."` // Deprecated: Please use CloudwatchTemplateURI - CloudwatchLogGroup string `json:"cloudwatch-log-group" pflag:",Log group to which streams are associated."` - CloudwatchTemplateURI TemplateURI `json:"cloudwatch-template-uri" pflag:",Template Uri to use when building cloudwatch log links"` + CloudwatchLogGroup string `json:"cloudwatch-log-group" pflag:",Log group to which streams are associated."` + CloudwatchTemplateURI tasklog.TemplateURI `json:"cloudwatch-template-uri" pflag:",Template Uri to use when building cloudwatch log links"` IsKubernetesEnabled bool `json:"kubernetes-enabled" pflag:",Enable Kubernetes Logging"` // Deprecated: Please use KubernetesTemplateURI - KubernetesURL string `json:"kubernetes-url" pflag:",Console URL for Kubernetes logs"` - KubernetesTemplateURI TemplateURI `json:"kubernetes-template-uri" pflag:",Template Uri to use when building kubernetes log links"` + KubernetesURL string `json:"kubernetes-url" pflag:",Console URL for Kubernetes logs"` + 
KubernetesTemplateURI tasklog.TemplateURI `json:"kubernetes-template-uri" pflag:",Template Uri to use when building kubernetes log links"` IsStackDriverEnabled bool `json:"stackdriver-enabled" pflag:",Enable Log-links to stackdriver"` // Deprecated: Please use StackDriverTemplateURI GCPProjectName string `json:"gcp-project" pflag:",Name of the project in GCP"` // Deprecated: Please use StackDriverTemplateURI - StackdriverLogResourceName string `json:"stackdriver-logresourcename" pflag:",Name of the logresource in stackdriver"` - StackDriverTemplateURI TemplateURI `json:"stackdriver-template-uri" pflag:",Template Uri to use when building stackdriver log links"` - - Templates []TemplateLogPluginConfig `json:"templates" pflag:"-,"` -} + StackdriverLogResourceName string `json:"stackdriver-logresourcename" pflag:",Name of the logresource in stackdriver"` + StackDriverTemplateURI tasklog.TemplateURI `json:"stackdriver-template-uri" pflag:",Template Uri to use when building stackdriver log links"` -type TemplateLogPluginConfig struct { - DisplayName string `json:"displayName" pflag:",Display name for the generated log when displayed in the console."` - TemplateURIs []TemplateURI `json:"templateUris" pflag:",URI Templates for generating task log links."` - MessageFormat core.TaskLog_MessageFormat `json:"messageFormat" pflag:",Log Message Format."` - Scheme tasklog.TemplateScheme `json:"scheme" pflag:",Templating scheme to use. 
Supported values are Pod and TaskExecution."` + Templates []tasklog.TemplateLogPlugin `json:"templates" pflag:"-,"` } var ( diff --git a/flyteplugins/go/tasks/logs/logging_utils.go b/flyteplugins/go/tasks/logs/logging_utils.go index 0ca515d7c8..4978109458 100644 --- a/flyteplugins/go/tasks/logs/logging_utils.go +++ b/flyteplugins/go/tasks/logs/logging_utils.go @@ -8,17 +8,13 @@ import ( v1 "k8s.io/api/core/v1" "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" + pluginsCore "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core" "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/tasklog" "github.com/flyteorg/flyte/flytestdlib/logger" ) -type logPlugin struct { - Name string - Plugin tasklog.Plugin -} - // Internal -func GetLogsForContainerInPod(ctx context.Context, logPlugin tasklog.Plugin, taskExecID *core.TaskExecutionIdentifier, pod *v1.Pod, index uint32, nameSuffix string, extraLogTemplateVarsByScheme *tasklog.TemplateVarsByScheme) ([]*core.TaskLog, error) { +func GetLogsForContainerInPod(ctx context.Context, logPlugin tasklog.Plugin, taskExecID pluginsCore.TaskExecutionID, pod *v1.Pod, index uint32, nameSuffix string, extraLogTemplateVarsByScheme *tasklog.TemplateVarsByScheme) ([]*core.TaskLog, error) { if logPlugin == nil { return nil, nil } @@ -53,7 +49,7 @@ func GetLogsForContainerInPod(ctx context.Context, logPlugin tasklog.Plugin, tas PodRFC3339FinishTime: time.Unix(finishTime, 0).Format(time.RFC3339), PodUnixStartTime: startTime, PodUnixFinishTime: finishTime, - TaskExecutionIdentifier: taskExecID, + TaskExecutionID: taskExecID, ExtraTemplateVarsByScheme: extraLogTemplateVarsByScheme, }, ) @@ -65,67 +61,53 @@ func GetLogsForContainerInPod(ctx context.Context, logPlugin tasklog.Plugin, tas return logs.TaskLogs, nil } -type taskLogPluginWrapper struct { - logPlugins []logPlugin +type templateLogPluginCollection struct { + plugins []tasklog.TemplateLogPlugin } -func (t taskLogPluginWrapper) GetTaskLogs(input 
tasklog.Input) (logOutput tasklog.Output, err error) { - logs := make([]*core.TaskLog, 0, len(t.logPlugins)) - suffix := input.LogName +func (t templateLogPluginCollection) GetTaskLogs(input tasklog.Input) (tasklog.Output, error) { + var taskLogs []*core.TaskLog - for _, plugin := range t.logPlugins { - input.LogName = plugin.Name + suffix - o, err := plugin.Plugin.GetTaskLogs(input) + for _, plugin := range t.plugins { + o, err := plugin.GetTaskLogs(input) if err != nil { return tasklog.Output{}, err } - - logs = append(logs, o.TaskLogs...) + taskLogs = append(taskLogs, o.TaskLogs...) } - return tasklog.Output{ - TaskLogs: logs, - }, nil + return tasklog.Output{TaskLogs: taskLogs}, nil } // InitializeLogPlugins initializes log plugin based on config. func InitializeLogPlugins(cfg *LogConfig) (tasklog.Plugin, error) { // Use a list to maintain order. - logPlugins := make([]logPlugin, 0, 2) + var plugins []tasklog.TemplateLogPlugin if cfg.IsKubernetesEnabled { if len(cfg.KubernetesTemplateURI) > 0 { - logPlugins = append(logPlugins, logPlugin{Name: "Kubernetes Logs", Plugin: tasklog.NewTemplateLogPlugin(tasklog.TemplateSchemePod, []string{cfg.KubernetesTemplateURI}, core.TaskLog_JSON)}) + plugins = append(plugins, tasklog.TemplateLogPlugin{DisplayName: "Kubernetes Logs", Scheme: tasklog.TemplateSchemePod, TemplateURIs: []tasklog.TemplateURI{cfg.KubernetesTemplateURI}, MessageFormat: core.TaskLog_JSON}) } else { - logPlugins = append(logPlugins, logPlugin{Name: "Kubernetes Logs", Plugin: tasklog.NewTemplateLogPlugin(tasklog.TemplateSchemePod, []string{fmt.Sprintf("%s/#!/log/{{ .namespace }}/{{ .podName }}/pod?namespace={{ .namespace }}", cfg.KubernetesURL)}, core.TaskLog_JSON)}) + plugins = append(plugins, tasklog.TemplateLogPlugin{DisplayName: "Kubernetes Logs", Scheme: tasklog.TemplateSchemePod, TemplateURIs: []tasklog.TemplateURI{fmt.Sprintf("%s/#!/log/{{ .namespace }}/{{ .podName }}/pod?namespace={{ .namespace }}", cfg.KubernetesURL)}, MessageFormat: 
core.TaskLog_JSON}) } } if cfg.IsCloudwatchEnabled { if len(cfg.CloudwatchTemplateURI) > 0 { - logPlugins = append(logPlugins, logPlugin{Name: "Cloudwatch Logs", Plugin: tasklog.NewTemplateLogPlugin(tasklog.TemplateSchemePod, []string{cfg.CloudwatchTemplateURI}, core.TaskLog_JSON)}) + plugins = append(plugins, tasklog.TemplateLogPlugin{DisplayName: "Cloudwatch Logs", Scheme: tasklog.TemplateSchemePod, TemplateURIs: []tasklog.TemplateURI{cfg.CloudwatchTemplateURI}, MessageFormat: core.TaskLog_JSON}) } else { - logPlugins = append(logPlugins, logPlugin{Name: "Cloudwatch Logs", Plugin: tasklog.NewTemplateLogPlugin(tasklog.TemplateSchemePod, []string{fmt.Sprintf("https://console.aws.amazon.com/cloudwatch/home?region=%s#logEventViewer:group=%s;stream=var.log.containers.{{ .podName }}_{{ .namespace }}_{{ .containerName }}-{{ .containerId }}.log", cfg.CloudwatchRegion, cfg.CloudwatchLogGroup)}, core.TaskLog_JSON)}) + plugins = append(plugins, tasklog.TemplateLogPlugin{DisplayName: "Cloudwatch Logs", Scheme: tasklog.TemplateSchemePod, TemplateURIs: []tasklog.TemplateURI{fmt.Sprintf("https://console.aws.amazon.com/cloudwatch/home?region=%s#logEventViewer:group=%s;stream=var.log.containers.{{ .podName }}_{{ .namespace }}_{{ .containerName }}-{{ .containerId }}.log", cfg.CloudwatchRegion, cfg.CloudwatchLogGroup)}, MessageFormat: core.TaskLog_JSON}) } } if cfg.IsStackDriverEnabled { if len(cfg.StackDriverTemplateURI) > 0 { - logPlugins = append(logPlugins, logPlugin{Name: "Stackdriver Logs", Plugin: tasklog.NewTemplateLogPlugin(tasklog.TemplateSchemePod, []string{cfg.StackDriverTemplateURI}, core.TaskLog_JSON)}) + plugins = append(plugins, tasklog.TemplateLogPlugin{DisplayName: "Stackdriver Logs", Scheme: tasklog.TemplateSchemePod, TemplateURIs: []tasklog.TemplateURI{cfg.StackDriverTemplateURI}, MessageFormat: core.TaskLog_JSON}) } else { - logPlugins = append(logPlugins, logPlugin{Name: "Stackdriver Logs", Plugin: tasklog.NewTemplateLogPlugin(tasklog.TemplateSchemePod, 
[]string{fmt.Sprintf("https://console.cloud.google.com/logs/viewer?project=%s&angularJsUrl=%%2Flogs%%2Fviewer%%3Fproject%%3D%s&resource=%s&advancedFilter=resource.labels.pod_name%%3D{{ .podName }}", cfg.GCPProjectName, cfg.GCPProjectName, cfg.StackdriverLogResourceName)}, core.TaskLog_JSON)}) + plugins = append(plugins, tasklog.TemplateLogPlugin{DisplayName: "Stackdriver Logs", Scheme: tasklog.TemplateSchemePod, TemplateURIs: []tasklog.TemplateURI{fmt.Sprintf("https://console.cloud.google.com/logs/viewer?project=%s&angularJsUrl=%%2Flogs%%2Fviewer%%3Fproject%%3D%s&resource=%s&advancedFilter=resource.labels.pod_name%%3D{{ .podName }}", cfg.GCPProjectName, cfg.GCPProjectName, cfg.StackdriverLogResourceName)}, MessageFormat: core.TaskLog_JSON}) } } - if len(cfg.Templates) > 0 { - for _, cfg := range cfg.Templates { - logPlugins = append(logPlugins, logPlugin{Name: cfg.DisplayName, Plugin: tasklog.NewTemplateLogPlugin(cfg.Scheme, cfg.TemplateURIs, cfg.MessageFormat)}) - } - } - - if len(logPlugins) == 0 { - return nil, nil - } - - return taskLogPluginWrapper{logPlugins: logPlugins}, nil + plugins = append(plugins, cfg.Templates...) 
+ return templateLogPluginCollection{plugins: plugins}, nil } diff --git a/flyteplugins/go/tasks/logs/logging_utils_test.go b/flyteplugins/go/tasks/logs/logging_utils_test.go index fbf86b9933..91048eff16 100644 --- a/flyteplugins/go/tasks/logs/logging_utils_test.go +++ b/flyteplugins/go/tasks/logs/logging_utils_test.go @@ -10,34 +10,41 @@ import ( v12 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" + pluginCore "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core" + coreMocks "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core/mocks" "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/tasklog" ) const podName = "PodName" -var dummyTaskExecID = &core.TaskExecutionIdentifier{ - TaskId: &core.Identifier{ - ResourceType: core.ResourceType_TASK, - Name: "my-task-name", - Project: "my-task-project", - Domain: "my-task-domain", - Version: "1", - }, - NodeExecutionId: &core.NodeExecutionIdentifier{ - NodeId: "n0", - ExecutionId: &core.WorkflowExecutionIdentifier{ - Name: "my-execution-name", - Project: "my-execution-project", - Domain: "my-execution-domain", +func dummyTaskExecID() pluginCore.TaskExecutionID { + tID := &coreMocks.TaskExecutionID{} + tID.OnGetGeneratedName().Return("generated-name") + tID.OnGetID().Return(core.TaskExecutionIdentifier{ + TaskId: &core.Identifier{ + ResourceType: core.ResourceType_TASK, + Name: "my-task-name", + Project: "my-task-project", + Domain: "my-task-domain", + Version: "1", }, - }, - RetryAttempt: 1, + NodeExecutionId: &core.NodeExecutionIdentifier{ + ExecutionId: &core.WorkflowExecutionIdentifier{ + Name: "my-execution-name", + Project: "my-execution-project", + Domain: "my-execution-domain", + }, + }, + RetryAttempt: 1, + }) + tID.OnGetUniqueNodeID().Return("n0-0-n0") + return tID } func TestGetLogsForContainerInPod_NoPlugins(t *testing.T) { logPlugin, err := InitializeLogPlugins(&LogConfig{}) assert.NoError(t, err) - l, err := 
GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, nil, 0, " Suffix", nil) + l, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), nil, 0, " Suffix", nil) assert.NoError(t, err) assert.Nil(t, l) } @@ -49,7 +56,7 @@ func TestGetLogsForContainerInPod_NoLogs(t *testing.T) { CloudwatchLogGroup: "/kubernetes/flyte-production", }) assert.NoError(t, err) - p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, nil, 0, " Suffix", nil) + p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), nil, 0, " Suffix", nil) assert.NoError(t, err) assert.Nil(t, p) } @@ -80,7 +87,7 @@ func TestGetLogsForContainerInPod_BadIndex(t *testing.T) { } pod.Name = podName - p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 1, " Suffix", nil) + p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), pod, 1, " Suffix", nil) assert.NoError(t, err) assert.Nil(t, p) } @@ -105,7 +112,7 @@ func TestGetLogsForContainerInPod_MissingStatus(t *testing.T) { } pod.Name = podName - p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 1, " Suffix", nil) + p, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), pod, 1, " Suffix", nil) assert.NoError(t, err) assert.Nil(t, p) } @@ -135,7 +142,7 @@ func TestGetLogsForContainerInPod_Cloudwatch(t *testing.T) { } pod.Name = podName - logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 0, " Suffix", nil) + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), pod, 0, " Suffix", nil) assert.Nil(t, err) assert.Len(t, logs, 1) } @@ -165,7 +172,7 @@ func TestGetLogsForContainerInPod_K8s(t *testing.T) { } pod.Name = podName - logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 0, " Suffix", nil) + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, 
dummyTaskExecID(), pod, 0, " Suffix", nil) assert.Nil(t, err) assert.Len(t, logs, 1) } @@ -198,7 +205,7 @@ func TestGetLogsForContainerInPod_All(t *testing.T) { } pod.Name = podName - logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 0, " Suffix", nil) + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), pod, 0, " Suffix", nil) assert.Nil(t, err) assert.Len(t, logs, 2) } @@ -229,7 +236,7 @@ func TestGetLogsForContainerInPod_Stackdriver(t *testing.T) { } pod.Name = podName - logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 0, " Suffix", nil) + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), pod, 0, " Suffix", nil) assert.Nil(t, err) assert.Len(t, logs, 1) } @@ -303,7 +310,7 @@ func assertTestSucceeded(tb testing.TB, config *LogConfig, expectedTaskLogs []*c }, } - logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID, pod, 0, " my-Suffix", nil) + logs, err := GetLogsForContainerInPod(context.TODO(), logPlugin, dummyTaskExecID(), pod, 0, " my-Suffix", nil) assert.Nil(tb, err) assert.Len(tb, logs, len(expectedTaskLogs)) if diff := deep.Equal(logs, expectedTaskLogs); len(diff) > 0 { @@ -313,7 +320,7 @@ func assertTestSucceeded(tb testing.TB, config *LogConfig, expectedTaskLogs []*c func TestGetLogsForContainerInPod_Templates(t *testing.T) { assertTestSucceeded(t, &LogConfig{ - Templates: []TemplateLogPluginConfig{ + Templates: []tasklog.TemplateLogPlugin{ { DisplayName: "StackDriver", TemplateURIs: []string{ @@ -337,7 +344,7 @@ func TestGetLogsForContainerInPod_Templates(t *testing.T) { Name: "StackDriver my-Suffix", }, { - Uri: "https://flyte.corp.net/console/projects/my-execution-project/domains/my-execution-domain/executions/my-execution-name/nodeId/n0/taskId/my-task-name/attempt/1/view/logs", + Uri: 
"https://flyte.corp.net/console/projects/my-execution-project/domains/my-execution-domain/executions/my-execution-name/nodeId/n0-0-n0/taskId/my-task-name/attempt/1/view/logs", MessageFormat: core.TaskLog_JSON, Name: "Internal my-Suffix", }, diff --git a/flyteplugins/go/tasks/pluginmachinery/core/exec_metadata.go b/flyteplugins/go/tasks/pluginmachinery/core/exec_metadata.go index 8517b9c385..9ac650baaa 100644 --- a/flyteplugins/go/tasks/pluginmachinery/core/exec_metadata.go +++ b/flyteplugins/go/tasks/pluginmachinery/core/exec_metadata.go @@ -27,6 +27,10 @@ type TaskExecutionID interface { // GetID returns the underlying idl task identifier. GetID() core.TaskExecutionIdentifier + + // GetUniqueNodeID returns the fully-qualified Node ID that is unique within a + // given workflow execution. + GetUniqueNodeID() string } // TaskExecutionMetadata represents any execution information for a Task. It is used to communicate meta information about the diff --git a/flyteplugins/go/tasks/pluginmachinery/core/mocks/task_execution_id.go b/flyteplugins/go/tasks/pluginmachinery/core/mocks/task_execution_id.go index 7db5590170..44596bf82f 100644 --- a/flyteplugins/go/tasks/pluginmachinery/core/mocks/task_execution_id.go +++ b/flyteplugins/go/tasks/pluginmachinery/core/mocks/task_execution_id.go @@ -114,3 +114,35 @@ func (_m *TaskExecutionID) GetID() flyteidlcore.TaskExecutionIdentifier { return r0 } + +type TaskExecutionID_GetUniqueNodeID struct { + *mock.Call +} + +func (_m TaskExecutionID_GetUniqueNodeID) Return(_a0 string) *TaskExecutionID_GetUniqueNodeID { + return &TaskExecutionID_GetUniqueNodeID{Call: _m.Call.Return(_a0)} +} + +func (_m *TaskExecutionID) OnGetUniqueNodeID() *TaskExecutionID_GetUniqueNodeID { + c_call := _m.On("GetUniqueNodeID") + return &TaskExecutionID_GetUniqueNodeID{Call: c_call} +} + +func (_m *TaskExecutionID) OnGetUniqueNodeIDMatch(matchers ...interface{}) *TaskExecutionID_GetUniqueNodeID { + c_call := _m.On("GetUniqueNodeID", matchers...) 
+ return &TaskExecutionID_GetUniqueNodeID{Call: c_call} +} + +// GetUniqueNodeID provides a mock function with given fields: +func (_m *TaskExecutionID) GetUniqueNodeID() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} diff --git a/flyteplugins/go/tasks/pluginmachinery/flytek8s/k8s_resource_adds_test.go b/flyteplugins/go/tasks/pluginmachinery/flytek8s/k8s_resource_adds_test.go index 5703b88d81..60a3833397 100644 --- a/flyteplugins/go/tasks/pluginmachinery/flytek8s/k8s_resource_adds_test.go +++ b/flyteplugins/go/tasks/pluginmachinery/flytek8s/k8s_resource_adds_test.go @@ -227,6 +227,10 @@ func (m mockTaskExecutionIdentifier) GetGeneratedName() string { return "task-exec-name" } +func (m mockTaskExecutionIdentifier) GetUniqueNodeID() string { + return "unique-node-id" +} + func TestDecorateEnvVars(t *testing.T) { ctx := context.Background() ctx = contextutils.WithWorkflowID(ctx, "fake_workflow") diff --git a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go index 8d33305806..c326a3ddd1 100644 --- a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go +++ b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go @@ -24,6 +24,7 @@ import ( const PodKind = "pod" const OOMKilled = "OOMKilled" const Interrupted = "Interrupted" +const PrimaryContainerNotFound = "PrimaryContainerNotFound" const SIGKILL = 137 const defaultContainerTemplateName = "default" @@ -746,7 +747,7 @@ func DeterminePrimaryContainerPhase(primaryContainerName string, statuses []v1.C } // If for some reason we can't find the primary container, always just return a permanent failure - return pluginsCore.PhaseInfoFailure("PrimaryContainerMissing", + return pluginsCore.PhaseInfoFailure(PrimaryContainerNotFound, fmt.Sprintf("Primary container [%s] not found in pod's container statuses", 
primaryContainerName), info) } diff --git a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go index 0c3f8d90f2..ec2f8f89da 100644 --- a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go +++ b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go @@ -1682,6 +1682,7 @@ func TestDeterminePrimaryContainerPhase(t *testing.T) { secondaryContainer, }, info) assert.Equal(t, pluginsCore.PhasePermanentFailure, phaseInfo.Phase()) + assert.Equal(t, PrimaryContainerNotFound, phaseInfo.Err().Code) assert.Equal(t, "Primary container [primary] not found in pod's container statuses", phaseInfo.Err().Message) }) } diff --git a/flyteplugins/go/tasks/pluginmachinery/tasklog/plugin.go b/flyteplugins/go/tasks/pluginmachinery/tasklog/plugin.go index 0ca91c3370..c43da02e58 100644 --- a/flyteplugins/go/tasks/pluginmachinery/tasklog/plugin.go +++ b/flyteplugins/go/tasks/pluginmachinery/tasklog/plugin.go @@ -4,6 +4,7 @@ import ( "regexp" "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" + pluginsCore "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core" ) //go:generate enumer --type=TemplateScheme --trimprefix=TemplateScheme -json -yaml @@ -15,6 +16,9 @@ const ( TemplateSchemeTaskExecution ) +// TemplateURI is a URI that accepts templates. See: go/tasks/pluginmachinery/tasklog/template.go for available templates. 
+type TemplateURI = string + type TemplateVar struct { Regex *regexp.Regexp Value string @@ -42,7 +46,7 @@ type Input struct { PodUnixStartTime int64 PodUnixFinishTime int64 PodUID string - TaskExecutionIdentifier *core.TaskExecutionIdentifier + TaskExecutionID pluginsCore.TaskExecutionID ExtraTemplateVarsByScheme *TemplateVarsByScheme } @@ -56,3 +60,10 @@ type Plugin interface { // Generates a TaskLog object given necessary computation information GetTaskLogs(i Input) (logs Output, err error) } + +type TemplateLogPlugin struct { + DisplayName string `json:"displayName" pflag:",Display name for the generated log when displayed in the console."` + TemplateURIs []TemplateURI `json:"templateUris" pflag:",URI Templates for generating task log links."` + MessageFormat core.TaskLog_MessageFormat `json:"messageFormat" pflag:",Log Message Format."` + Scheme TemplateScheme `json:"scheme" pflag:",Templating scheme to use. Supported values are Pod and TaskExecution."` +} diff --git a/flyteplugins/go/tasks/pluginmachinery/tasklog/template.go b/flyteplugins/go/tasks/pluginmachinery/tasklog/template.go index 2a68f42cff..750b1972df 100644 --- a/flyteplugins/go/tasks/pluginmachinery/tasklog/template.go +++ b/flyteplugins/go/tasks/pluginmachinery/tasklog/template.go @@ -34,6 +34,7 @@ type templateRegexes struct { ExecutionName *regexp.Regexp ExecutionProject *regexp.Regexp ExecutionDomain *regexp.Regexp + GeneratedName *regexp.Regexp } func initDefaultRegexes() templateRegexes { @@ -58,6 +59,7 @@ func initDefaultRegexes() templateRegexes { MustCreateRegex("executionName"), MustCreateRegex("executionProject"), MustCreateRegex("executionDomain"), + MustCreateRegex("generatedName"), } } @@ -114,55 +116,59 @@ func (input Input) templateVarsForScheme(scheme TemplateScheme) TemplateVars { vars = append(vars, input.ExtraTemplateVarsByScheme.Pod...) 
} case TemplateSchemeTaskExecution: - if input.TaskExecutionIdentifier != nil { - vars = append(vars, TemplateVar{ + taskExecutionIdentifier := input.TaskExecutionID.GetID() + vars = append( + vars, + TemplateVar{ + defaultRegexes.NodeID, + input.TaskExecutionID.GetUniqueNodeID(), + }, + TemplateVar{ + defaultRegexes.GeneratedName, + input.TaskExecutionID.GetGeneratedName(), + }, + TemplateVar{ defaultRegexes.TaskRetryAttempt, - strconv.FormatUint(uint64(input.TaskExecutionIdentifier.RetryAttempt), 10), - }) - if input.TaskExecutionIdentifier.TaskId != nil { - vars = append( - vars, - TemplateVar{ - defaultRegexes.TaskID, - input.TaskExecutionIdentifier.TaskId.Name, - }, - TemplateVar{ - defaultRegexes.TaskVersion, - input.TaskExecutionIdentifier.TaskId.Version, - }, - TemplateVar{ - defaultRegexes.TaskProject, - input.TaskExecutionIdentifier.TaskId.Project, - }, - TemplateVar{ - defaultRegexes.TaskDomain, - input.TaskExecutionIdentifier.TaskId.Domain, - }, - ) - } - if input.TaskExecutionIdentifier.NodeExecutionId != nil { - vars = append(vars, TemplateVar{ - defaultRegexes.NodeID, - input.TaskExecutionIdentifier.NodeExecutionId.NodeId, - }) - if input.TaskExecutionIdentifier.NodeExecutionId.ExecutionId != nil { - vars = append( - vars, - TemplateVar{ - defaultRegexes.ExecutionName, - input.TaskExecutionIdentifier.NodeExecutionId.ExecutionId.Name, - }, - TemplateVar{ - defaultRegexes.ExecutionProject, - input.TaskExecutionIdentifier.NodeExecutionId.ExecutionId.Project, - }, - TemplateVar{ - defaultRegexes.ExecutionDomain, - input.TaskExecutionIdentifier.NodeExecutionId.ExecutionId.Domain, - }, - ) - } - } + strconv.FormatUint(uint64(taskExecutionIdentifier.RetryAttempt), 10), + }, + ) + if taskExecutionIdentifier.TaskId != nil { + vars = append( + vars, + TemplateVar{ + defaultRegexes.TaskID, + taskExecutionIdentifier.TaskId.Name, + }, + TemplateVar{ + defaultRegexes.TaskVersion, + taskExecutionIdentifier.TaskId.Version, + }, + TemplateVar{ + 
defaultRegexes.TaskProject, + taskExecutionIdentifier.TaskId.Project, + }, + TemplateVar{ + defaultRegexes.TaskDomain, + taskExecutionIdentifier.TaskId.Domain, + }, + ) + } + if taskExecutionIdentifier.NodeExecutionId != nil && taskExecutionIdentifier.NodeExecutionId.ExecutionId != nil { + vars = append( + vars, + TemplateVar{ + defaultRegexes.ExecutionName, + taskExecutionIdentifier.NodeExecutionId.ExecutionId.Name, + }, + TemplateVar{ + defaultRegexes.ExecutionProject, + taskExecutionIdentifier.NodeExecutionId.ExecutionId.Project, + }, + TemplateVar{ + defaultRegexes.ExecutionDomain, + taskExecutionIdentifier.NodeExecutionId.ExecutionId.Domain, + }, + ) } if gotExtraTemplateVars { vars = append(vars, input.ExtraTemplateVarsByScheme.TaskExecution...) @@ -172,55 +178,16 @@ func (input Input) templateVarsForScheme(scheme TemplateScheme) TemplateVars { return vars } -// A simple log plugin that supports templates in urls to build the final log link. -// See `defaultRegexes` for supported templates. 
-type TemplateLogPlugin struct { - scheme TemplateScheme - templateUris []string - messageFormat core.TaskLog_MessageFormat -} - -func (s TemplateLogPlugin) GetTaskLog(podName, podUID, namespace, containerName, containerID, logName string, podRFC3339StartTime string, podRFC3339FinishTime string, podUnixStartTime, podUnixFinishTime int64) (core.TaskLog, error) { - o, err := s.GetTaskLogs(Input{ - LogName: logName, - Namespace: namespace, - PodName: podName, - PodUID: podUID, - ContainerName: containerName, - ContainerID: containerID, - PodRFC3339StartTime: podRFC3339StartTime, - PodRFC3339FinishTime: podRFC3339FinishTime, - PodUnixStartTime: podUnixStartTime, - PodUnixFinishTime: podUnixFinishTime, - }) - - if err != nil || len(o.TaskLogs) == 0 { - return core.TaskLog{}, err - } - - return *o.TaskLogs[0], nil -} - -func (s TemplateLogPlugin) GetTaskLogs(input Input) (Output, error) { - templateVars := input.templateVarsForScheme(s.scheme) - taskLogs := make([]*core.TaskLog, 0, len(s.templateUris)) - for _, templateURI := range s.templateUris { +func (p TemplateLogPlugin) GetTaskLogs(input Input) (Output, error) { + templateVars := input.templateVarsForScheme(p.Scheme) + taskLogs := make([]*core.TaskLog, 0, len(p.TemplateURIs)) + for _, templateURI := range p.TemplateURIs { taskLogs = append(taskLogs, &core.TaskLog{ Uri: replaceAll(templateURI, templateVars), - Name: input.LogName, - MessageFormat: s.messageFormat, + Name: p.DisplayName + input.LogName, + MessageFormat: p.MessageFormat, }) } return Output{TaskLogs: taskLogs}, nil } - -// NewTemplateLogPlugin creates a template-based log plugin with the provided template Uri and message format. -// See `defaultRegexes` for supported templates. 
-func NewTemplateLogPlugin(scheme TemplateScheme, templateUris []string, messageFormat core.TaskLog_MessageFormat) TemplateLogPlugin { - return TemplateLogPlugin{ - scheme: scheme, - templateUris: templateUris, - messageFormat: messageFormat, - } -} diff --git a/flyteplugins/go/tasks/pluginmachinery/tasklog/template_test.go b/flyteplugins/go/tasks/pluginmachinery/tasklog/template_test.go index e3f03047aa..f279707a3b 100644 --- a/flyteplugins/go/tasks/pluginmachinery/tasklog/template_test.go +++ b/flyteplugins/go/tasks/pluginmachinery/tasklog/template_test.go @@ -5,32 +5,13 @@ import ( "regexp" "testing" - "github.com/go-test/deep" "github.com/stretchr/testify/assert" "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" + pluginCore "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core" + coreMocks "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core/mocks" ) -func TestTemplateLog(t *testing.T) { - p := NewTemplateLogPlugin(TemplateSchemePod, []string{"https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/flyte-production/kubernetes;stream=var.log.containers.{{.podName}}_{{.podUID}}_{{.namespace}}_{{.containerName}}-{{.containerId}}.log"}, core.TaskLog_JSON) - tl, err := p.GetTaskLog( - "f-uuid-driver", - "pod-uid", - "flyteexamples-production", - "spark-kubernetes-driver", - "cri-o://abc", - "main_logs", - "2015-03-14T17:08:14+01:00", - "2021-06-15T20:47:57+02:00", - 1426349294, - 1623782877, - ) - assert.NoError(t, err) - assert.Equal(t, tl.GetName(), "main_logs") - assert.Equal(t, tl.GetMessageFormat(), core.TaskLog_JSON) - assert.Equal(t, "https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/flyte-production/kubernetes;stream=var.log.containers.f-uuid-driver_pod-uid_flyteexamples-production_spark-kubernetes-driver-abc.log", tl.Uri) -} - // Latest Run: Benchmark_mustInitTemplateRegexes-16 45960 26914 ns/op func Benchmark_initDefaultRegexes(b *testing.B) { for 
i := 0; i < b.N; i++ { @@ -38,6 +19,30 @@ func Benchmark_initDefaultRegexes(b *testing.B) { } } +func dummyTaskExecID() pluginCore.TaskExecutionID { + tID := &coreMocks.TaskExecutionID{} + tID.OnGetGeneratedName().Return("generated-name") + tID.OnGetID().Return(core.TaskExecutionIdentifier{ + TaskId: &core.Identifier{ + ResourceType: core.ResourceType_TASK, + Name: "my-task-name", + Project: "my-task-project", + Domain: "my-task-domain", + Version: "1", + }, + NodeExecutionId: &core.NodeExecutionIdentifier{ + ExecutionId: &core.WorkflowExecutionIdentifier{ + Name: "my-execution-name", + Project: "my-execution-project", + Domain: "my-execution-domain", + }, + }, + RetryAttempt: 1, + }) + tID.OnGetUniqueNodeID().Return("n0-0-n0") + return tID +} + func Test_Input_templateVarsForScheme(t *testing.T) { testRegexes := struct { Foo *regexp.Regexp @@ -66,25 +71,8 @@ func Test_Input_templateVarsForScheme(t *testing.T) { PodUnixFinishTime: 12345, } taskExecutionBase := Input{ - LogName: "main_logs", - TaskExecutionIdentifier: &core.TaskExecutionIdentifier{ - TaskId: &core.Identifier{ - ResourceType: core.ResourceType_TASK, - Name: "my-task-name", - Project: "my-task-project", - Domain: "my-task-domain", - Version: "1", - }, - NodeExecutionId: &core.NodeExecutionIdentifier{ - NodeId: "n0", - ExecutionId: &core.WorkflowExecutionIdentifier{ - Name: "my-execution-name", - Project: "my-execution-project", - Domain: "my-execution-domain", - }, - }, - RetryAttempt: 0, - }, + LogName: "main_logs", + TaskExecutionID: dummyTaskExecID(), } tests := []struct { @@ -162,12 +150,13 @@ func Test_Input_templateVarsForScheme(t *testing.T) { nil, TemplateVars{ {defaultRegexes.LogName, "main_logs"}, - {defaultRegexes.TaskRetryAttempt, "0"}, + {defaultRegexes.NodeID, "n0-0-n0"}, + {defaultRegexes.GeneratedName, "generated-name"}, + {defaultRegexes.TaskRetryAttempt, "1"}, {defaultRegexes.TaskID, "my-task-name"}, {defaultRegexes.TaskVersion, "1"}, {defaultRegexes.TaskProject, "my-task-project"}, 
{defaultRegexes.TaskDomain, "my-task-domain"}, - {defaultRegexes.NodeID, "n0"}, {defaultRegexes.ExecutionName, "my-execution-name"}, {defaultRegexes.ExecutionProject, "my-execution-project"}, {defaultRegexes.ExecutionDomain, "my-execution-domain"}, @@ -236,147 +225,99 @@ func Test_Input_templateVarsForScheme(t *testing.T) { } } -func Test_templateLogPlugin_Regression(t *testing.T) { - type fields struct { - templateURI string - messageFormat core.TaskLog_MessageFormat - } +func TestTemplateLogPlugin(t *testing.T) { type args struct { - podName string - podUID string - namespace string - containerName string - containerID string - logName string - podRFC3339StartTime string - podRFC3339FinishTime string - podUnixStartTime int64 - podUnixFinishTime int64 + input Input } tests := []struct { - name string - fields fields - args args - want core.TaskLog - wantErr bool + name string + plugin TemplateLogPlugin + args args + want Output }{ { "cloudwatch", - fields{ - templateURI: "https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/flyte-production/kubernetes;stream=var.log.containers.{{.podName}}_{{.namespace}}_{{.containerName}}-{{.containerId}}.log", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + TemplateURIs: []TemplateURI{"https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/flyte-production/kubernetes;stream=var.log.containers.{{.podName}}_{{.namespace}}_{{.containerName}}-{{.containerId}}.log"}, + MessageFormat: core.TaskLog_JSON, }, args{ - podName: "f-uuid-driver", - podUID: "pod-uid", - namespace: "flyteexamples-production", - containerName: "spark-kubernetes-driver", - containerID: "cri-o://abc", - logName: "main_logs", - podRFC3339StartTime: "1970-01-01T01:02:03+01:00", - podRFC3339FinishTime: "1970-01-01T04:25:45+01:00", - podUnixStartTime: 123, - podUnixFinishTime: 12345, - }, - core.TaskLog{ + input: Input{ + PodName: "f-uuid-driver", + PodUID: "pod-uid", + Namespace: 
"flyteexamples-production", + ContainerName: "spark-kubernetes-driver", + ContainerID: "cri-o://abc", + LogName: "main_logs", + PodRFC3339StartTime: "1970-01-01T01:02:03+01:00", + PodRFC3339FinishTime: "1970-01-01T04:25:45+01:00", + PodUnixStartTime: 123, + PodUnixFinishTime: 12345, + }, + }, + Output{TaskLogs: []*core.TaskLog{{ Uri: "https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/flyte-production/kubernetes;stream=var.log.containers.f-uuid-driver_flyteexamples-production_spark-kubernetes-driver-abc.log", MessageFormat: core.TaskLog_JSON, Name: "main_logs", - }, - false, + }}}, }, { "stackdriver", - fields{ - templateURI: "https://console.cloud.google.com/logs/viewer?project=test-gcp-project&angularJsUrl=%2Flogs%2Fviewer%3Fproject%3Dtest-gcp-project&resource=aws_ec2_instance&advancedFilter=resource.labels.pod_name%3D{{.podName}}", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + TemplateURIs: []TemplateURI{"https://console.cloud.google.com/logs/viewer?project=test-gcp-project&angularJsUrl=%2Flogs%2Fviewer%3Fproject%3Dtest-gcp-project&resource=aws_ec2_instance&advancedFilter=resource.labels.pod_name%3D{{.podName}}"}, + MessageFormat: core.TaskLog_JSON, }, args{ - podName: "podName", - podUID: "pod-uid", - namespace: "flyteexamples-production", - containerName: "spark-kubernetes-driver", - containerID: "cri-o://abc", - logName: "main_logs", - podRFC3339StartTime: "1970-01-01T01:02:03+01:00", - podRFC3339FinishTime: "1970-01-01T04:25:45+01:00", - podUnixStartTime: 123, - podUnixFinishTime: 12345, - }, - core.TaskLog{ + input: Input{ + PodName: "podName", + PodUID: "pod-uid", + Namespace: "flyteexamples-production", + ContainerName: "spark-kubernetes-driver", + ContainerID: "cri-o://abc", + LogName: "main_logs", + PodRFC3339StartTime: "1970-01-01T01:02:03+01:00", + PodRFC3339FinishTime: "1970-01-01T04:25:45+01:00", + PodUnixStartTime: 123, + PodUnixFinishTime: 12345, + }, + }, + Output{TaskLogs: []*core.TaskLog{{ Uri: 
"https://console.cloud.google.com/logs/viewer?project=test-gcp-project&angularJsUrl=%2Flogs%2Fviewer%3Fproject%3Dtest-gcp-project&resource=aws_ec2_instance&advancedFilter=resource.labels.pod_name%3DpodName", MessageFormat: core.TaskLog_JSON, Name: "main_logs", - }, - false, + }}}, }, { "kubernetes", - fields{ - templateURI: "https://dashboard.k8s.net/#!/log/{{.namespace}}/{{.podName}}/pod?namespace={{.namespace}}", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + TemplateURIs: []TemplateURI{"https://dashboard.k8s.net/#!/log/{{.namespace}}/{{.podName}}/pod?namespace={{.namespace}}"}, + MessageFormat: core.TaskLog_JSON, }, args{ - podName: "flyteexamples-development-task-name", - podUID: "pod-uid", - namespace: "flyteexamples-development", - containerName: "ignore", - containerID: "ignore", - logName: "main_logs", - podRFC3339StartTime: "1970-01-01T01:02:03+01:00", - podRFC3339FinishTime: "1970-01-01T04:25:45+01:00", - podUnixStartTime: 123, - podUnixFinishTime: 12345, - }, - core.TaskLog{ + input: Input{ + PodName: "flyteexamples-development-task-name", + PodUID: "pod-uid", + Namespace: "flyteexamples-development", + ContainerName: "ignore", + ContainerID: "ignore", + LogName: "main_logs", + PodRFC3339StartTime: "1970-01-01T01:02:03+01:00", + PodRFC3339FinishTime: "1970-01-01T04:25:45+01:00", + PodUnixStartTime: 123, + PodUnixFinishTime: 12345, + }, + }, + Output{TaskLogs: []*core.TaskLog{{ Uri: "https://dashboard.k8s.net/#!/log/flyteexamples-development/flyteexamples-development-task-name/pod?namespace=flyteexamples-development", MessageFormat: core.TaskLog_JSON, Name: "main_logs", - }, - false, + }}}, }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - s := TemplateLogPlugin{ - templateUris: []string{tt.fields.templateURI}, - messageFormat: tt.fields.messageFormat, - } - - got, err := s.GetTaskLog(tt.args.podName, tt.args.podUID, tt.args.namespace, tt.args.containerName, tt.args.containerID, tt.args.logName, 
tt.args.podRFC3339FinishTime, tt.args.podRFC3339FinishTime, tt.args.podUnixStartTime, tt.args.podUnixFinishTime) - if (err != nil) != tt.wantErr { - t.Errorf("GetTaskLog() error = %v, wantErr %v", err, tt.wantErr) - return - } - - if diff := deep.Equal(got, tt.want); diff != nil { - t.Errorf("GetTaskLog() got = %v, want %v, diff: %v", got, tt.want, diff) - } - }) - } -} - -func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { - type fields struct { - scheme TemplateScheme - templateURI string - messageFormat core.TaskLog_MessageFormat - } - type args struct { - input Input - } - tests := []struct { - name string - fields fields - args args - want Output - wantErr bool - }{ { "splunk", - fields{ - templateURI: "https://prd-p-ighar.splunkcloud.com/en-US/app/search/search?q=search%20container_name%3D%22{{ .containerName }}%22", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + TemplateURIs: []TemplateURI{"https://prd-p-ighar.splunkcloud.com/en-US/app/search/search?q=search%20container_name%3D%22{{ .containerName }}%22"}, + MessageFormat: core.TaskLog_JSON, }, args{ input: Input{ @@ -401,13 +342,12 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { }, }, }, - false, }, { "ddog", - fields{ - templateURI: "https://app.datadoghq.com/logs?event&from_ts={{ .podUnixStartTime }}&live=true&query=pod_name%3A{{ .podName }}&to_ts={{ .podUnixFinishTime }}", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + TemplateURIs: []TemplateURI{"https://app.datadoghq.com/logs?event&from_ts={{ .podUnixStartTime }}&live=true&query=pod_name%3A{{ .podName }}&to_ts={{ .podUnixFinishTime }}"}, + MessageFormat: core.TaskLog_JSON, }, args{ input: Input{ @@ -432,13 +372,12 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { }, }, }, - false, }, { "stackdriver-with-rfc3339-timestamp", - fields{ - templateURI: 
"https://console.cloud.google.com/logs/viewer?project=test-gcp-project&angularJsUrl=%2Flogs%2Fviewer%3Fproject%3Dtest-gcp-project&resource=aws_ec2_instance&advancedFilter=resource.labels.pod_name%3D{{.podName}}%20%22{{.podRFC3339StartTime}}%22", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + TemplateURIs: []TemplateURI{"https://console.cloud.google.com/logs/viewer?project=test-gcp-project&angularJsUrl=%2Flogs%2Fviewer%3Fproject%3Dtest-gcp-project&resource=aws_ec2_instance&advancedFilter=resource.labels.pod_name%3D{{.podName}}%20%22{{.podRFC3339StartTime}}%22"}, + MessageFormat: core.TaskLog_JSON, }, args{ input: Input{ @@ -463,14 +402,13 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { }, }, }, - false, }, { "task-with-task-execution-identifier", - fields{ - scheme: TemplateSchemeTaskExecution, - templateURI: "https://flyte.corp.net/console/projects/{{ .executionProject }}/domains/{{ .executionDomain }}/executions/{{ .executionName }}/nodeId/{{ .nodeID }}/taskId/{{ .taskID }}/attempt/{{ .taskRetryAttempt }}/view/logs", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + Scheme: TemplateSchemeTaskExecution, + TemplateURIs: []TemplateURI{"https://flyte.corp.net/console/projects/{{ .executionProject }}/domains/{{ .executionDomain }}/executions/{{ .executionName }}/nodeId/{{ .nodeID }}/taskId/{{ .taskID }}/attempt/{{ .taskRetryAttempt }}/view/logs"}, + MessageFormat: core.TaskLog_JSON, }, args{ input: Input{ @@ -484,43 +422,25 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { PodRFC3339FinishTime: "1970-01-01T04:25:45+01:00", PodUnixStartTime: 123, PodUnixFinishTime: 12345, - TaskExecutionIdentifier: &core.TaskExecutionIdentifier{ - TaskId: &core.Identifier{ - ResourceType: core.ResourceType_TASK, - Name: "my-task-name", - Project: "my-task-project", - Domain: "my-task-domain", - Version: "1", - }, - NodeExecutionId: &core.NodeExecutionIdentifier{ - NodeId: "n0", - ExecutionId: &core.WorkflowExecutionIdentifier{ - Name: 
"my-execution-name", - Project: "my-execution-project", - Domain: "my-execution-domain", - }, - }, - RetryAttempt: 0, - }, + TaskExecutionID: dummyTaskExecID(), }, }, Output{ TaskLogs: []*core.TaskLog{ { - Uri: "https://flyte.corp.net/console/projects/my-execution-project/domains/my-execution-domain/executions/my-execution-name/nodeId/n0/taskId/my-task-name/attempt/0/view/logs", + Uri: "https://flyte.corp.net/console/projects/my-execution-project/domains/my-execution-domain/executions/my-execution-name/nodeId/n0-0-n0/taskId/my-task-name/attempt/1/view/logs", MessageFormat: core.TaskLog_JSON, Name: "main_logs", }, }, }, - false, }, { "mapped-task-with-task-execution-identifier", - fields{ - scheme: TemplateSchemeTaskExecution, - templateURI: "https://flyte.corp.net/console/projects/{{ .executionProject }}/domains/{{ .executionDomain }}/executions/{{ .executionName }}/nodeId/{{ .nodeID }}/taskId/{{ .taskID }}/attempt/{{ .subtaskParentRetryAttempt }}/mappedIndex/{{ .subtaskExecutionIndex }}/mappedAttempt/{{ .subtaskRetryAttempt }}/view/logs", - messageFormat: core.TaskLog_JSON, + TemplateLogPlugin{ + Scheme: TemplateSchemeTaskExecution, + TemplateURIs: []TemplateURI{"https://flyte.corp.net/console/projects/{{ .executionProject }}/domains/{{ .executionDomain }}/executions/{{ .executionName }}/nodeId/{{ .nodeID }}/taskId/{{ .taskID }}/attempt/{{ .subtaskParentRetryAttempt }}/mappedIndex/{{ .subtaskExecutionIndex }}/mappedAttempt/{{ .subtaskRetryAttempt }}/view/logs"}, + MessageFormat: core.TaskLog_JSON, }, args{ input: Input{ @@ -534,24 +454,7 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { PodRFC3339FinishTime: "1970-01-01T04:25:45+01:00", PodUnixStartTime: 123, PodUnixFinishTime: 12345, - TaskExecutionIdentifier: &core.TaskExecutionIdentifier{ - TaskId: &core.Identifier{ - ResourceType: core.ResourceType_TASK, - Name: "my-task-name", - Project: "my-task-project", - Domain: "my-task-domain", - Version: "1", - }, - NodeExecutionId: 
&core.NodeExecutionIdentifier{ - NodeId: "n0", - ExecutionId: &core.WorkflowExecutionIdentifier{ - Name: "my-execution-name", - Project: "my-execution-project", - Domain: "my-execution-domain", - }, - }, - RetryAttempt: 0, - }, + TaskExecutionID: dummyTaskExecID(), ExtraTemplateVarsByScheme: &TemplateVarsByScheme{ TaskExecution: TemplateVars{ {MustCreateRegex("subtaskExecutionIndex"), "1"}, @@ -564,29 +467,20 @@ func TestTemplateLogPlugin_NewTaskLog(t *testing.T) { Output{ TaskLogs: []*core.TaskLog{ { - Uri: "https://flyte.corp.net/console/projects/my-execution-project/domains/my-execution-domain/executions/my-execution-name/nodeId/n0/taskId/my-task-name/attempt/0/mappedIndex/1/mappedAttempt/1/view/logs", + Uri: "https://flyte.corp.net/console/projects/my-execution-project/domains/my-execution-domain/executions/my-execution-name/nodeId/n0-0-n0/taskId/my-task-name/attempt/0/mappedIndex/1/mappedAttempt/1/view/logs", MessageFormat: core.TaskLog_JSON, Name: "main_logs", }, }, }, - false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - s := TemplateLogPlugin{ - scheme: tt.fields.scheme, - templateUris: []string{tt.fields.templateURI}, - messageFormat: tt.fields.messageFormat, - } - got, err := s.GetTaskLogs(tt.args.input) - if (err != nil) != tt.wantErr { - t.Errorf("NewTaskLog() error = %v, wantErr %v", err, tt.wantErr) - return - } + got, err := tt.plugin.GetTaskLogs(tt.args.input) + assert.NoError(t, err) if !reflect.DeepEqual(got, tt.want) { - t.Errorf("NewTaskLog() got = %v, want %v", got, tt.want) + t.Errorf("GetTaskLogs() got = %v, want %v", got, tt.want) } }) } diff --git a/flyteplugins/go/tasks/plugins/array/catalog.go b/flyteplugins/go/tasks/plugins/array/catalog.go index 8544555609..836ff52fda 100644 --- a/flyteplugins/go/tasks/plugins/array/catalog.go +++ b/flyteplugins/go/tasks/plugins/array/catalog.go @@ -74,11 +74,10 @@ func DetermineDiscoverability(ctx context.Context, tCtx core.TaskExecutionContex return state, 
errors.Errorf(errors.MetadataAccessFailed, "Could not read inputs and therefore failed to determine array job size") } + // identify and validate the size of the array job size := -1 var literalCollection *idlCore.LiteralCollection - literals := make([][]*idlCore.Literal, 0) - discoveredInputNames := make([]string, 0) - for inputName, literal := range inputs.Literals { + for _, literal := range inputs.Literals { if literalCollection = literal.GetCollection(); literalCollection != nil { // validate length of input list if size != -1 && size != len(literalCollection.Literals) { @@ -86,9 +85,6 @@ func DetermineDiscoverability(ctx context.Context, tCtx core.TaskExecutionContex return state, nil } - literals = append(literals, literalCollection.Literals) - discoveredInputNames = append(discoveredInputNames, inputName) - size = len(literalCollection.Literals) } } @@ -110,7 +106,7 @@ func DetermineDiscoverability(ctx context.Context, tCtx core.TaskExecutionContex arrayJobSize = int64(size) // build input readers - inputReaders = ConstructStaticInputReaders(tCtx.InputReader(), literals, discoveredInputNames) + inputReaders = ConstructStaticInputReaders(tCtx.InputReader(), inputs.Literals, size) } if arrayJobSize > maxArrayJobSize { @@ -246,18 +242,7 @@ func WriteToDiscovery(ctx context.Context, tCtx core.TaskExecutionContext, state return state, externalResources, errors.Errorf(errors.MetadataAccessFailed, "Could not read inputs and therefore failed to determine array job size") } - var literalCollection *idlCore.LiteralCollection - literals := make([][]*idlCore.Literal, 0) - discoveredInputNames := make([]string, 0) - for inputName, literal := range inputs.Literals { - if literalCollection = literal.GetCollection(); literalCollection != nil { - literals = append(literals, literalCollection.Literals) - discoveredInputNames = append(discoveredInputNames, inputName) - } - } - - // build input readers - inputReaders = ConstructStaticInputReaders(tCtx.InputReader(), literals, 
discoveredInputNames) + inputReaders = ConstructStaticInputReaders(tCtx.InputReader(), inputs.Literals, arrayJobSize) } // output reader @@ -476,16 +461,19 @@ func ConstructCatalogReaderWorkItems(ctx context.Context, taskReader core.TaskRe // ConstructStaticInputReaders constructs input readers that comply with the io.InputReader interface but have their // inputs already populated. -func ConstructStaticInputReaders(inputPaths io.InputFilePaths, inputs [][]*idlCore.Literal, inputNames []string) []io.InputReader { - inputReaders := make([]io.InputReader, 0, len(inputs)) - if len(inputs) == 0 { - return inputReaders - } +func ConstructStaticInputReaders(inputPaths io.InputFilePaths, inputLiterals map[string]*idlCore.Literal, arrayJobSize int) []io.InputReader { + var literalCollection *idlCore.LiteralCollection - for i := 0; i < len(inputs[0]); i++ { + inputReaders := make([]io.InputReader, 0, arrayJobSize) + for i := 0; i < arrayJobSize; i++ { literals := make(map[string]*idlCore.Literal) - for j := 0; j < len(inputNames); j++ { - literals[inputNames[j]] = inputs[j][i] + for inputName, inputLiteral := range inputLiterals { + if literalCollection = inputLiteral.GetCollection(); literalCollection != nil { + // if literal is a collection then we need to retrieve the specific literal for this subtask index + literals[inputName] = literalCollection.Literals[i] + } else { + literals[inputName] = inputLiteral + } } inputReaders = append(inputReaders, NewStaticInputReader(inputPaths, &idlCore.LiteralMap{Literals: literals})) diff --git a/flyteplugins/go/tasks/plugins/array/core/state.go b/flyteplugins/go/tasks/plugins/array/core/state.go index 665eec309f..f601ec375b 100644 --- a/flyteplugins/go/tasks/plugins/array/core/state.go +++ b/flyteplugins/go/tasks/plugins/array/core/state.go @@ -273,7 +273,10 @@ func SummaryToPhase(ctx context.Context, minSuccesses int64, summary arraystatus logger.Infof(ctx, "Array is still running and waiting for resources 
totalWaitingForResources[%v]", totalWaitingForResources) return PhaseWaitingForResources } - if totalSuccesses >= minSuccesses && totalRunning == 0 { + + // if we have enough successes, ensure all tasks are in a terminal phase (success or failure) + // before transitioning to the next phase. + if totalSuccesses >= minSuccesses && totalSuccesses+totalPermanentFailures == totalCount { logger.Infof(ctx, "Array succeeded because totalSuccesses[%v] >= minSuccesses[%v]", totalSuccesses, minSuccesses) return PhaseWriteToDiscovery } diff --git a/flyteplugins/go/tasks/plugins/array/core/state_test.go b/flyteplugins/go/tasks/plugins/array/core/state_test.go index 26a80531b5..01b5b41528 100644 --- a/flyteplugins/go/tasks/plugins/array/core/state_test.go +++ b/flyteplugins/go/tasks/plugins/array/core/state_test.go @@ -349,6 +349,24 @@ func TestSummaryToPhase(t *testing.T) { core.PhaseRetryableFailure: 5, }, }, + { + // complete retry even though minSuccesses is achieved + "RetryMinSuccessRatio", + PhaseCheckingSubTaskExecutions, + map[core.Phase]int64{ + core.PhaseSuccess: 10, + core.PhaseRetryableFailure: 1, + }, + }, + { + // ensure all tasks are executed even if minSuccesses is achieved + "ExecuteAllMinSuccessRatio", + PhaseCheckingSubTaskExecutions, + map[core.Phase]int64{ + core.PhaseSuccess: 10, + core.PhaseUndefined: 1, + }, + }, } for _, tt := range tests { diff --git a/flyteplugins/go/tasks/plugins/k8s/dask/dask.go b/flyteplugins/go/tasks/plugins/k8s/dask/dask.go index 65050f5bb2..eb27aec3ce 100644 --- a/flyteplugins/go/tasks/plugins/k8s/dask/dask.go +++ b/flyteplugins/go/tasks/plugins/k8s/dask/dask.go @@ -298,13 +298,13 @@ func (p daskResourceHandler) GetTaskPhase(ctx context.Context, pluginContext k8s status == daskAPI.DaskJobClusterCreated if !isQueued { - taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID().GetID() + taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID() o, err := logPlugin.GetTaskLogs( tasklog.Input{ - 
Namespace: job.ObjectMeta.Namespace, - PodName: job.Status.JobRunnerPodName, - LogName: "(User logs)", - TaskExecutionIdentifier: &taskExecID, + Namespace: job.ObjectMeta.Namespace, + PodName: job.Status.JobRunnerPodName, + LogName: "(User logs)", + TaskExecutionID: taskExecID, }, ) if err != nil { diff --git a/flyteplugins/go/tasks/plugins/k8s/kfoperators/common/common_operator.go b/flyteplugins/go/tasks/plugins/k8s/kfoperators/common/common_operator.go index e0903d02a3..594767b4b4 100644 --- a/flyteplugins/go/tasks/plugins/k8s/kfoperators/common/common_operator.go +++ b/flyteplugins/go/tasks/plugins/k8s/kfoperators/common/common_operator.go @@ -98,7 +98,7 @@ func GetLogs(pluginContext k8s.PluginContext, taskType string, objectMeta meta_v namespace := objectMeta.Namespace taskLogs := make([]*core.TaskLog, 0, 10) - taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID().GetID() + taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID() logPlugin, err := logs.InitializeLogPlugins(logs.GetLogConfig()) @@ -120,14 +120,14 @@ func GetLogs(pluginContext k8s.PluginContext, taskType string, objectMeta meta_v if taskType == PytorchTaskType && hasMaster { masterTaskLog, masterErr := logPlugin.GetTaskLogs( tasklog.Input{ - PodName: name + "-master-0", - Namespace: namespace, - LogName: "master", - PodRFC3339StartTime: RFC3999StartTime, - PodRFC3339FinishTime: RFC3999FinishTime, - PodUnixStartTime: startTime, - PodUnixFinishTime: finishTime, - TaskExecutionIdentifier: &taskExecID, + PodName: name + "-master-0", + Namespace: namespace, + LogName: "master", + PodRFC3339StartTime: RFC3999StartTime, + PodRFC3339FinishTime: RFC3999FinishTime, + PodUnixStartTime: startTime, + PodUnixFinishTime: finishTime, + TaskExecutionID: taskExecID, }, ) if masterErr != nil { @@ -139,13 +139,13 @@ func GetLogs(pluginContext k8s.PluginContext, taskType string, objectMeta meta_v // get all workers log for workerIndex := int32(0); workerIndex < workersCount; 
workerIndex++ { workerLog, err := logPlugin.GetTaskLogs(tasklog.Input{ - PodName: name + fmt.Sprintf("-worker-%d", workerIndex), - Namespace: namespace, - PodRFC3339StartTime: RFC3999StartTime, - PodRFC3339FinishTime: RFC3999FinishTime, - PodUnixStartTime: startTime, - PodUnixFinishTime: finishTime, - TaskExecutionIdentifier: &taskExecID, + PodName: name + fmt.Sprintf("-worker-%d", workerIndex), + Namespace: namespace, + PodRFC3339StartTime: RFC3999StartTime, + PodRFC3339FinishTime: RFC3999FinishTime, + PodUnixStartTime: startTime, + PodUnixFinishTime: finishTime, + TaskExecutionID: taskExecID, }) if err != nil { return nil, err @@ -160,9 +160,9 @@ func GetLogs(pluginContext k8s.PluginContext, taskType string, objectMeta meta_v // get all parameter servers logs for psReplicaIndex := int32(0); psReplicaIndex < psReplicasCount; psReplicaIndex++ { psReplicaLog, err := logPlugin.GetTaskLogs(tasklog.Input{ - PodName: name + fmt.Sprintf("-psReplica-%d", psReplicaIndex), - Namespace: namespace, - TaskExecutionIdentifier: &taskExecID, + PodName: name + fmt.Sprintf("-psReplica-%d", psReplicaIndex), + Namespace: namespace, + TaskExecutionID: taskExecID, }) if err != nil { return nil, err @@ -172,9 +172,9 @@ func GetLogs(pluginContext k8s.PluginContext, taskType string, objectMeta meta_v // get chief worker log, and the max number of chief worker is 1 if chiefReplicasCount != 0 { chiefReplicaLog, err := logPlugin.GetTaskLogs(tasklog.Input{ - PodName: name + fmt.Sprintf("-chiefReplica-%d", 0), - Namespace: namespace, - TaskExecutionIdentifier: &taskExecID, + PodName: name + fmt.Sprintf("-chiefReplica-%d", 0), + Namespace: namespace, + TaskExecutionID: taskExecID, }) if err != nil { return nil, err @@ -184,9 +184,9 @@ func GetLogs(pluginContext k8s.PluginContext, taskType string, objectMeta meta_v // get evaluator log, and the max number of evaluator is 1 if evaluatorReplicasCount != 0 { evaluatorReplicasCount, err := logPlugin.GetTaskLogs(tasklog.Input{ - PodName: name + 
fmt.Sprintf("-evaluatorReplica-%d", 0), - Namespace: namespace, - TaskExecutionIdentifier: &taskExecID, + PodName: name + fmt.Sprintf("-evaluatorReplica-%d", 0), + Namespace: namespace, + TaskExecutionID: taskExecID, }) if err != nil { return nil, err diff --git a/flyteplugins/go/tasks/plugins/k8s/pod/plugin.go b/flyteplugins/go/tasks/plugins/k8s/pod/plugin.go index d21eefb8b9..11de877021 100644 --- a/flyteplugins/go/tasks/plugins/k8s/pod/plugin.go +++ b/flyteplugins/go/tasks/plugins/k8s/pod/plugin.go @@ -164,9 +164,9 @@ func (plugin) GetTaskPhaseWithLogs(ctx context.Context, pluginContext k8s.Plugin ReportedAt: &reportedAt, } - taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID().GetID() + taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID() if pod.Status.Phase != v1.PodPending && pod.Status.Phase != v1.PodUnknown { - taskLogs, err := logs.GetLogsForContainerInPod(ctx, logPlugin, &taskExecID, pod, 0, logSuffix, extraLogTemplateVarsByScheme) + taskLogs, err := logs.GetLogsForContainerInPod(ctx, logPlugin, taskExecID, pod, 0, logSuffix, extraLogTemplateVarsByScheme) if err != nil { return pluginsCore.PhaseInfoUndefined, err } @@ -211,7 +211,17 @@ func (plugin) GetTaskPhaseWithLogs(ctx context.Context, pluginContext k8s.Plugin } else { // if the primary container annotation exists, we use the status of the specified container phaseInfo = flytek8s.DeterminePrimaryContainerPhase(primaryContainerName, pod.Status.ContainerStatuses, &info) - if phaseInfo.Phase() == pluginsCore.PhaseRunning && len(info.Logs) > 0 { + if phaseInfo.Phase() == pluginsCore.PhasePermanentFailure && phaseInfo.Err() != nil && + phaseInfo.Err().GetCode() == flytek8s.PrimaryContainerNotFound { + // if the primary container status is not found ensure that the primary container exists. + // note: it should be impossible for the primary container to not exist at this point. 
+ for _, container := range pod.Spec.Containers { + if container.Name == primaryContainerName { + phaseInfo = pluginsCore.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, &info) + break + } + } + } else if phaseInfo.Phase() == pluginsCore.PhaseRunning && len(info.Logs) > 0 { phaseInfo = phaseInfo.WithVersion(pluginsCore.DefaultPhaseVersion + 1) } } diff --git a/flyteplugins/go/tasks/plugins/k8s/pod/sidecar_test.go b/flyteplugins/go/tasks/plugins/k8s/pod/sidecar_test.go index 48a000de17..0c728780d9 100644 --- a/flyteplugins/go/tasks/plugins/k8s/pod/sidecar_test.go +++ b/flyteplugins/go/tasks/plugins/k8s/pod/sidecar_test.go @@ -844,6 +844,13 @@ func TestDemystifiedSidecarStatus_PrimaryRunning(t *testing.T) { func TestDemystifiedSidecarStatus_PrimaryMissing(t *testing.T) { res := &v1.Pod{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "Secondary", + }, + }, + }, Status: v1.PodStatus{ Phase: v1.PodRunning, ContainerStatuses: []v1.ContainerStatus{ @@ -862,6 +869,33 @@ func TestDemystifiedSidecarStatus_PrimaryMissing(t *testing.T) { assert.Equal(t, pluginsCore.PhasePermanentFailure, phaseInfo.Phase()) } +func TestDemystifiedSidecarStatus_PrimaryNotExistsYet(t *testing.T) { + res := &v1.Pod{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "Primary", + }, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "Secondary", + }, + }, + }, + } + res.SetAnnotations(map[string]string{ + flytek8s.PrimaryContainerKey: "Primary", + }) + taskCtx := getDummySidecarTaskContext(&core.TaskTemplate{}, sidecarResourceRequirements, nil) + phaseInfo, err := DefaultPodPlugin.GetTaskPhase(context.TODO(), taskCtx, res) + assert.Nil(t, err) + assert.Equal(t, pluginsCore.PhaseRunning, phaseInfo.Phase()) +} + func TestGetProperties(t *testing.T) { expected := k8s.PluginProperties{} assert.Equal(t, expected, DefaultPodPlugin.GetProperties()) diff --git a/flyteplugins/go/tasks/plugins/k8s/ray/config.go 
b/flyteplugins/go/tasks/plugins/k8s/ray/config.go index 10a8068344..8601264edf 100644 --- a/flyteplugins/go/tasks/plugins/k8s/ray/config.go +++ b/flyteplugins/go/tasks/plugins/k8s/ray/config.go @@ -3,9 +3,12 @@ package ray import ( "context" + v1 "k8s.io/api/core/v1" + pluginsConfig "github.com/flyteorg/flyte/flyteplugins/go/tasks/config" "github.com/flyteorg/flyte/flyteplugins/go/tasks/logs" pluginmachinery "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/k8s" + "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/tasklog" "github.com/flyteorg/flyte/flytestdlib/config" ) @@ -76,10 +79,12 @@ type Config struct { DeprecatedNodeIPAddress string `json:"nodeIPAddress,omitempty" pflag:"-,DEPRECATED. Please use DefaultConfig.[HeadNode|WorkerNode].IPAddress"` // Remote Ray Cluster Config - RemoteClusterConfig pluginmachinery.ClusterConfig `json:"remoteClusterConfig" pflag:"Configuration of remote K8s cluster for ray jobs"` - Logs logs.LogConfig `json:"logs" pflag:"-,Log configuration for ray jobs"` - Defaults DefaultConfig `json:"defaults" pflag:"-,Default configuration for ray jobs"` - EnableUsageStats bool `json:"enableUsageStats" pflag:",Enable usage stats for ray jobs. These stats are submitted to usage-stats.ray.io per https://docs.ray.io/en/latest/cluster/usage-stats.html"` + RemoteClusterConfig pluginmachinery.ClusterConfig `json:"remoteClusterConfig" pflag:"Configuration of remote K8s cluster for ray jobs"` + Logs logs.LogConfig `json:"logs" pflag:"-,Log configuration for ray jobs"` + LogsSidecar *v1.Container `json:"logsSidecar" pflag:"-,Sidecar to inject into head pods for capturing ray job logs"` + DashboardURLTemplate *tasklog.TemplateLogPlugin `json:"dashboardURLTemplate" pflag:",Template for URL of Ray dashboard running on a head node."` + Defaults DefaultConfig `json:"defaults" pflag:"-,Default configuration for ray jobs"` + EnableUsageStats bool `json:"enableUsageStats" pflag:",Enable usage stats for ray jobs. 
These stats are submitted to usage-stats.ray.io per https://docs.ray.io/en/latest/cluster/usage-stats.html"` } type DefaultConfig struct { diff --git a/flyteplugins/go/tasks/plugins/k8s/ray/ray.go b/flyteplugins/go/tasks/plugins/k8s/ray/ray.go index 22840aa909..0bc4f1183b 100644 --- a/flyteplugins/go/tasks/plugins/k8s/ray/ray.go +++ b/flyteplugins/go/tasks/plugins/k8s/ray/ray.go @@ -13,6 +13,7 @@ import ( "k8s.io/client-go/kubernetes/scheme" "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/plugins" flyteerr "github.com/flyteorg/flyte/flyteplugins/go/tasks/errors" "github.com/flyteorg/flyte/flyteplugins/go/tasks/logs" @@ -26,6 +27,8 @@ import ( ) const ( + rayStateMountPath = "/tmp/ray" + defaultRayStateVolName = "system-ray-state" rayTaskType = "ray" KindRayJob = "RayJob" IncludeDashboard = "include-dashboard" @@ -61,17 +64,18 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC return nil, flyteerr.Errorf(flyteerr.BadTaskSpecification, "Unable to create pod spec: [%v]", err.Error()) } - var container v1.Container - found := false - for _, c := range podSpec.Containers { + var primaryContainer *v1.Container + var primaryContainerIdx int + for idx, c := range podSpec.Containers { if c.Name == primaryContainerName { - container = c - found = true + c := c + primaryContainer = &c + primaryContainerIdx = idx break } } - if !found { + if primaryContainer == nil { return nil, flyteerr.Errorf(flyteerr.BadTaskSpecification, "Unable to get primary container from the pod: [%v]", err.Error()) } @@ -101,9 +105,15 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC } enableIngress := true + headPodSpec := podSpec.DeepCopy() rayClusterSpec := rayv1alpha1.RayClusterSpec{ HeadGroupSpec: rayv1alpha1.HeadGroupSpec{ - Template: buildHeadPodTemplate(&container, podSpec, objectMeta, taskCtx), + Template: 
buildHeadPodTemplate( + &headPodSpec.Containers[primaryContainerIdx], + headPodSpec, + objectMeta, + taskCtx, + ), ServiceType: v1.ServiceType(cfg.ServiceType), Replicas: &headReplicas, EnableIngress: &enableIngress, @@ -113,7 +123,13 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC } for _, spec := range rayJob.RayCluster.WorkerGroupSpec { - workerPodTemplate := buildWorkerPodTemplate(&container, podSpec, objectMeta, taskCtx) + workerPodSpec := podSpec.DeepCopy() + workerPodTemplate := buildWorkerPodTemplate( + &workerPodSpec.Containers[primaryContainerIdx], + workerPodSpec, + objectMeta, + taskCtx, + ) minReplicas := spec.Replicas maxReplicas := spec.Replicas @@ -161,7 +177,7 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC jobSpec := rayv1alpha1.RayJobSpec{ RayClusterSpec: rayClusterSpec, - Entrypoint: strings.Join(container.Args, " "), + Entrypoint: strings.Join(primaryContainer.Args, " "), ShutdownAfterJobFinishes: cfg.ShutdownAfterJobFinishes, TTLSecondsAfterFinished: &cfg.TTLSecondsAfterFinished, RuntimeEnv: rayJob.RuntimeEnv, @@ -179,10 +195,66 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC return &rayJobObject, nil } -func buildHeadPodTemplate(container *v1.Container, podSpec *v1.PodSpec, objectMeta *metav1.ObjectMeta, taskCtx pluginsCore.TaskExecutionContext) v1.PodTemplateSpec { +func injectLogsSidecar(primaryContainer *v1.Container, podSpec *v1.PodSpec) { + cfg := GetConfig() + if cfg.LogsSidecar == nil { + return + } + sidecar := cfg.LogsSidecar.DeepCopy() + + // Ray logs integration + var rayStateVolMount *v1.VolumeMount + // Look for an existing volume mount on the primary container, mounted at /tmp/ray + for _, vm := range primaryContainer.VolumeMounts { + if vm.MountPath == rayStateMountPath { + vm := vm + rayStateVolMount = &vm + break + } + } + // No existing volume mount exists at /tmp/ray. 
We create a new volume and volume + // mount and add it to the pod and container specs respectively + if rayStateVolMount == nil { + vol := v1.Volume{ + Name: defaultRayStateVolName, + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{}, + }, + } + podSpec.Volumes = append(podSpec.Volumes, vol) + volMount := v1.VolumeMount{ + Name: defaultRayStateVolName, + MountPath: rayStateMountPath, + } + primaryContainer.VolumeMounts = append(primaryContainer.VolumeMounts, volMount) + rayStateVolMount = &volMount + } + // We need to mirror the ray state volume mount into the sidecar as readonly, + // so that we can read the logs written by the head node. + readOnlyRayStateVolMount := *rayStateVolMount.DeepCopy() + readOnlyRayStateVolMount.ReadOnly = true + + // Update volume mounts on sidecar + // If one already exists with the desired mount path, simply replace it. Otherwise, + // add it to sidecar's volume mounts. + foundExistingSidecarVolMount := false + for idx, vm := range sidecar.VolumeMounts { + if vm.MountPath == rayStateMountPath { + foundExistingSidecarVolMount = true + sidecar.VolumeMounts[idx] = readOnlyRayStateVolMount + } + } + if !foundExistingSidecarVolMount { + sidecar.VolumeMounts = append(sidecar.VolumeMounts, readOnlyRayStateVolMount) + } + + // Add sidecar to containers + podSpec.Containers = append(podSpec.Containers, *sidecar) +} + +func buildHeadPodTemplate(primaryContainer *v1.Container, podSpec *v1.PodSpec, objectMeta *metav1.ObjectMeta, taskCtx pluginsCore.TaskExecutionContext) v1.PodTemplateSpec { // Some configs are copy from https://github.com/ray-project/kuberay/blob/b72e6bdcd9b8c77a9dc6b5da8560910f3a0c3ffd/apiserver/pkg/util/cluster.go#L97 // They should always be the same, so we could hard code here. 
- primaryContainer := container.DeepCopy() primaryContainer.Name = "ray-head" envs := []v1.EnvVar{ @@ -217,12 +289,11 @@ func buildHeadPodTemplate(container *v1.Container, podSpec *v1.PodSpec, objectMe primaryContainer.Ports = append(primaryContainer.Ports, ports...) - headPodSpec := podSpec.DeepCopy() - - headPodSpec.Containers = []v1.Container{*primaryContainer} + // Inject a sidecar for capturing and exposing Ray job logs + injectLogsSidecar(primaryContainer, podSpec) podTemplateSpec := v1.PodTemplateSpec{ - Spec: *headPodSpec, + Spec: *podSpec, ObjectMeta: *objectMeta, } cfg := config.GetK8sPluginConfig() @@ -231,7 +302,7 @@ func buildHeadPodTemplate(container *v1.Container, podSpec *v1.PodSpec, objectMe return podTemplateSpec } -func buildWorkerPodTemplate(container *v1.Container, podSpec *v1.PodSpec, objectMetadata *metav1.ObjectMeta, taskCtx pluginsCore.TaskExecutionContext) v1.PodTemplateSpec { +func buildWorkerPodTemplate(primaryContainer *v1.Container, podSpec *v1.PodSpec, objectMetadata *metav1.ObjectMeta, taskCtx pluginsCore.TaskExecutionContext) v1.PodTemplateSpec { // Some configs are copy from https://github.com/ray-project/kuberay/blob/b72e6bdcd9b8c77a9dc6b5da8560910f3a0c3ffd/apiserver/pkg/util/cluster.go#L185 // They should always be the same, so we could hard code here. initContainers := []v1.Container{ @@ -243,10 +314,11 @@ func buildWorkerPodTemplate(container *v1.Container, podSpec *v1.PodSpec, object "-c", "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", }, - Resources: container.Resources, + Resources: primaryContainer.Resources, }, } - primaryContainer := container.DeepCopy() + podSpec.InitContainers = append(podSpec.InitContainers, initContainers...) 
+ primaryContainer.Name = "ray-worker" primaryContainer.Args = []string{} @@ -342,12 +414,8 @@ func buildWorkerPodTemplate(container *v1.Container, podSpec *v1.PodSpec, object } primaryContainer.Ports = append(primaryContainer.Ports, ports...) - workerPodSpec := podSpec.DeepCopy() - workerPodSpec.Containers = []v1.Container{*primaryContainer} - workerPodSpec.InitContainers = initContainers - podTemplateSpec := v1.PodTemplateSpec{ - Spec: *workerPodSpec, + Spec: *podSpec, ObjectMeta: *objectMetadata, } podTemplateSpec.SetLabels(utils.UnionMaps(podTemplateSpec.GetLabels(), utils.CopyMap(taskCtx.TaskExecutionMetadata().GetLabels()))) @@ -370,26 +438,35 @@ func getEventInfoForRayJob(logConfig logs.LogConfig, pluginContext k8s.PluginCon return nil, fmt.Errorf("failed to initialize log plugins. Error: %w", err) } - if logPlugin == nil { - return nil, nil + var taskLogs []*core.TaskLog + + taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID() + input := tasklog.Input{ + Namespace: rayJob.Namespace, + TaskExecutionID: taskExecID, } // TODO: Retrieve the name of head pod from rayJob.status, and add it to task logs // RayJob CRD does not include the name of the worker or head pod for now - - taskID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID().GetID() - logOutput, err := logPlugin.GetTaskLogs(tasklog.Input{ - Namespace: rayJob.Namespace, - TaskExecutionIdentifier: &taskID, - }) - + logOutput, err := logPlugin.GetTaskLogs(input) if err != nil { return nil, fmt.Errorf("failed to generate task logs. Error: %w", err) } + taskLogs = append(taskLogs, logOutput.TaskLogs...) + + // Handling for Ray Dashboard + dashboardURLTemplate := GetConfig().DashboardURLTemplate + if dashboardURLTemplate != nil && + rayJob.Status.DashboardURL != "" && + rayJob.Status.JobStatus == rayv1alpha1.JobStatusRunning { + dashboardURLOutput, err := dashboardURLTemplate.GetTaskLogs(input) + if err != nil { + return nil, fmt.Errorf("failed to generate Ray dashboard link. 
Error: %w", err) + } + taskLogs = append(taskLogs, dashboardURLOutput.TaskLogs...) + } - return &pluginsCore.TaskInfo{ - Logs: logOutput.TaskLogs, - }, nil + return &pluginsCore.TaskInfo{Logs: taskLogs}, nil } func (plugin rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginContext k8s.PluginContext, resource client.Object) (pluginsCore.PhaseInfo, error) { @@ -403,7 +480,7 @@ func (plugin rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginCont return pluginsCore.PhaseInfoQueued(time.Now(), pluginsCore.DefaultPhaseVersion, "Scheduling"), nil } - // Kuberay creates a Ray cluster first, and then submits a Ray job to the cluster + // KubeRay creates a Ray cluster first, and then submits a Ray job to the cluster switch rayJob.Status.JobDeploymentStatus { case rayv1alpha1.JobDeploymentStatusInitializing: return pluginsCore.PhaseInfoInitializing(rayJob.CreationTimestamp.Time, pluginsCore.DefaultPhaseVersion, "cluster is creating", info), nil @@ -413,7 +490,7 @@ func (plugin rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginCont case rayv1alpha1.JobDeploymentStatusFailedJobDeploy: reason := fmt.Sprintf("Failed to submit Ray job %s with error: %s", rayJob.Name, rayJob.Status.Message) return pluginsCore.PhaseInfoFailure(flyteerr.TaskFailedWithError, reason, info), nil - case rayv1alpha1.JobDeploymentStatusWaitForDashboard: + case rayv1alpha1.JobDeploymentStatusWaitForDashboard, rayv1alpha1.JobDeploymentStatusFailedToGetJobStatus: return pluginsCore.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, info), nil case rayv1alpha1.JobDeploymentStatusRunning, rayv1alpha1.JobDeploymentStatusComplete: switch rayJob.Status.JobStatus { @@ -422,12 +499,27 @@ func (plugin rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginCont return pluginsCore.PhaseInfoFailure(flyteerr.TaskFailedWithError, reason, info), nil case rayv1alpha1.JobStatusSucceeded: return pluginsCore.PhaseInfoSuccess(info), nil - case rayv1alpha1.JobStatusPending, 
rayv1alpha1.JobStatusRunning: + case rayv1alpha1.JobStatusPending: return pluginsCore.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, info), nil + case rayv1alpha1.JobStatusRunning: + phaseInfo := pluginsCore.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, info) + if len(info.Logs) > 0 { + phaseInfo = phaseInfo.WithVersion(pluginsCore.DefaultPhaseVersion + 1) + } + return phaseInfo, nil + case rayv1alpha1.JobStatusStopped: + // There is no current usage of this job status in KubeRay. It's unclear what it represents + fallthrough + default: + // We already handle all known job status, so this should never happen unless a future version of ray + // introduced a new job status. + return pluginsCore.PhaseInfoUndefined, fmt.Errorf("unknown job status: %s", rayJob.Status.JobStatus) } + default: + // We already handle all known deployment status, so this should never happen unless a future version of ray + // introduced a new job status. + return pluginsCore.PhaseInfoUndefined, fmt.Errorf("unknown job deployment status: %s", rayJob.Status.JobDeploymentStatus) } - - return pluginsCore.PhaseInfoUndefined, nil } func init() { diff --git a/flyteplugins/go/tasks/plugins/k8s/ray/ray_test.go b/flyteplugins/go/tasks/plugins/k8s/ray/ray_test.go index 6e3253776e..ccb518fa03 100644 --- a/flyteplugins/go/tasks/plugins/k8s/ray/ray_test.go +++ b/flyteplugins/go/tasks/plugins/k8s/ray/ray_test.go @@ -5,7 +5,6 @@ import ( "testing" "time" - "github.com/golang/protobuf/jsonpb" structpb "github.com/golang/protobuf/ptypes/struct" rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" "github.com/stretchr/testify/assert" @@ -25,6 +24,7 @@ import ( pluginIOMocks "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/io/mocks" "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/k8s" mocks2 "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/k8s/mocks" + "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/tasklog" 
"github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/utils" ) @@ -56,28 +56,36 @@ var ( workerGroupName = "worker-group" ) -func dummyRayCustomObj() *plugins.RayJob { - return &plugins.RayJob{ - RayCluster: &plugins.RayCluster{ - HeadGroupSpec: &plugins.HeadGroupSpec{RayStartParams: map[string]string{"num-cpus": "1"}}, - WorkerGroupSpec: []*plugins.WorkerGroupSpec{{GroupName: workerGroupName, Replicas: 3}}, - }, +func transformRayJobToCustomObj(rayJob *plugins.RayJob) *structpb.Struct { + structObj, err := utils.MarshalObjToStruct(rayJob) + if err != nil { + panic(err) } + return structObj } -func dummyRayTaskTemplate(id string, rayJobObj *plugins.RayJob) *core.TaskTemplate { - ptObjJSON, err := utils.MarshalToString(rayJobObj) +func transformPodSpecToTaskTemplateTarget(podSpec *corev1.PodSpec) *core.TaskTemplate_K8SPod { + structObj, err := utils.MarshalObjToStruct(&podSpec) if err != nil { panic(err) } + return &core.TaskTemplate_K8SPod{ + K8SPod: &core.K8SPod{ + PodSpec: structObj, + }, + } +} - structObj := structpb.Struct{} - - err = jsonpb.UnmarshalString(ptObjJSON, &structObj) - if err != nil { - panic(err) +func dummyRayCustomObj() *plugins.RayJob { + return &plugins.RayJob{ + RayCluster: &plugins.RayCluster{ + HeadGroupSpec: &plugins.HeadGroupSpec{RayStartParams: map[string]string{"num-cpus": "1"}}, + WorkerGroupSpec: []*plugins.WorkerGroupSpec{{GroupName: workerGroupName, Replicas: 3}}, + }, } +} +func dummyRayTaskTemplate(id string, rayJob *plugins.RayJob) *core.TaskTemplate { return &core.TaskTemplate{ Id: &core.Identifier{Name: id}, Type: "container", @@ -88,7 +96,7 @@ func dummyRayTaskTemplate(id string, rayJobObj *plugins.RayJob) *core.TaskTempla Env: dummyEnvVars, }, }, - Custom: &structObj, + Custom: transformRayJobToCustomObj(rayJob), } } @@ -198,7 +206,7 @@ func TestBuildResourceRayExtendedResources(t *testing.T) { GpuResourceName: flytek8s.ResourceNvidiaGPU, })) - fixtures := []struct { + params := []struct { name string resources 
*corev1.ResourceRequirements extendedResourcesBase *core.ExtendedResources @@ -292,11 +300,11 @@ func TestBuildResourceRayExtendedResources(t *testing.T) { }, } - for _, f := range fixtures { - t.Run(f.name, func(t *testing.T) { + for _, p := range params { + t.Run(p.name, func(t *testing.T) { taskTemplate := dummyRayTaskTemplate("ray-id", dummyRayCustomObj()) - taskTemplate.ExtendedResources = f.extendedResourcesBase - taskContext := dummyRayTaskContext(taskTemplate, f.resources, f.extendedResourcesOverride) + taskTemplate.ExtendedResources = p.extendedResourcesBase + taskContext := dummyRayTaskContext(taskTemplate, p.resources, p.extendedResourcesOverride) rayJobResourceHandler := rayJobResourceHandler{} r, err := rayJobResourceHandler.BuildResource(context.TODO(), taskContext) assert.Nil(t, err) @@ -308,12 +316,12 @@ func TestBuildResourceRayExtendedResources(t *testing.T) { headNodeSpec := rayJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec assert.EqualValues( t, - f.expectedNsr, + p.expectedNsr, headNodeSpec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, ) assert.EqualValues( t, - f.expectedTol, + p.expectedTol, headNodeSpec.Tolerations, ) @@ -321,12 +329,12 @@ func TestBuildResourceRayExtendedResources(t *testing.T) { workerNodeSpec := rayJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec assert.EqualValues( t, - f.expectedNsr, + p.expectedNsr, workerNodeSpec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, ) assert.EqualValues( t, - f.expectedTol, + p.expectedTol, workerNodeSpec.Tolerations, ) }) @@ -382,6 +390,219 @@ func TestDefaultStartParameters(t *testing.T) { assert.Equal(t, ray.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Tolerations, toleration) } +func TestInjectLogsSidecar(t *testing.T) { + rayJobObj := transformRayJobToCustomObj(dummyRayCustomObj()) + params := []struct { + name string + taskTemplate core.TaskTemplate + // primaryContainerName string 
+ logsSidecarCfg *corev1.Container + expectedVolumes []corev1.Volume + expectedPrimaryContainerVolumeMounts []corev1.VolumeMount + expectedLogsSidecarVolumeMounts []corev1.VolumeMount + }{ + { + "container target", + core.TaskTemplate{ + Id: &core.Identifier{Name: "ray-id"}, + Target: &core.TaskTemplate_Container{ + Container: &core.Container{ + Image: testImage, + Args: testArgs, + }, + }, + Custom: rayJobObj, + }, + &corev1.Container{ + Name: "logs-sidecar", + Image: "test-image", + }, + []corev1.Volume{ + { + Name: "system-ray-state", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + }, + []corev1.VolumeMount{ + { + Name: "system-ray-state", + MountPath: "/tmp/ray", + }, + }, + []corev1.VolumeMount{ + { + Name: "system-ray-state", + MountPath: "/tmp/ray", + ReadOnly: true, + }, + }, + }, + { + "container target with no sidecar", + core.TaskTemplate{ + Id: &core.Identifier{Name: "ray-id"}, + Target: &core.TaskTemplate_Container{ + Container: &core.Container{ + Image: testImage, + Args: testArgs, + }, + }, + Custom: rayJobObj, + }, + nil, + nil, + nil, + nil, + }, + { + "pod target", + core.TaskTemplate{ + Id: &core.Identifier{Name: "ray-id"}, + Target: transformPodSpecToTaskTemplateTarget(&corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "main", + Image: "primary-image", + }, + }, + }), + Custom: rayJobObj, + Config: map[string]string{ + flytek8s.PrimaryContainerKey: "main", + }, + }, + &corev1.Container{ + Name: "logs-sidecar", + Image: "test-image", + }, + []corev1.Volume{ + { + Name: "system-ray-state", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + }, + []corev1.VolumeMount{ + { + Name: "system-ray-state", + MountPath: "/tmp/ray", + }, + }, + []corev1.VolumeMount{ + { + Name: "system-ray-state", + MountPath: "/tmp/ray", + ReadOnly: true, + }, + }, + }, + { + "pod target with existing ray state volume", + core.TaskTemplate{ + Id: &core.Identifier{Name: 
"ray-id"}, + Target: transformPodSpecToTaskTemplateTarget(&corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "main", + Image: "primary-image", + VolumeMounts: []corev1.VolumeMount{ + { + Name: "test-vol", + MountPath: "/tmp/ray", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "test-vol", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + }, + }), + Custom: rayJobObj, + Config: map[string]string{ + flytek8s.PrimaryContainerKey: "main", + }, + }, + &corev1.Container{ + Name: "logs-sidecar", + Image: "test-image", + }, + []corev1.Volume{ + { + Name: "test-vol", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + }, + []corev1.VolumeMount{ + { + Name: "test-vol", + MountPath: "/tmp/ray", + }, + }, + []corev1.VolumeMount{ + { + Name: "test-vol", + MountPath: "/tmp/ray", + ReadOnly: true, + }, + }, + }, + } + + for _, p := range params { + t.Run(p.name, func(t *testing.T) { + assert.NoError(t, SetConfig(&Config{ + LogsSidecar: p.logsSidecarCfg, + })) + taskContext := dummyRayTaskContext(&p.taskTemplate, resourceRequirements, nil) + rayJobResourceHandler := rayJobResourceHandler{} + r, err := rayJobResourceHandler.BuildResource(context.TODO(), taskContext) + assert.Nil(t, err) + assert.NotNil(t, r) + rayJob, ok := r.(*rayv1alpha1.RayJob) + assert.True(t, ok) + + headPodSpec := rayJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec + + // Check volumes + assert.EqualValues(t, p.expectedVolumes, headPodSpec.Volumes) + + // Check containers and respective volume mounts + foundPrimaryContainer := false + foundLogsSidecar := false + for _, cnt := range headPodSpec.Containers { + if cnt.Name == "ray-head" { + foundPrimaryContainer = true + assert.EqualValues( + t, + p.expectedPrimaryContainerVolumeMounts, + cnt.VolumeMounts, + ) + } + if p.logsSidecarCfg != nil && cnt.Name == p.logsSidecarCfg.Name { + foundLogsSidecar = true + assert.EqualValues( + t, + 
p.expectedLogsSidecarVolumeMounts, + cnt.VolumeMounts, + ) + } + } + assert.Equal(t, true, foundPrimaryContainer) + assert.Equal(t, p.logsSidecarCfg != nil, foundLogsSidecar) + }) + } +} + func newPluginContext() k8s.PluginContext { plg := &mocks2.PluginContext{} @@ -395,6 +616,8 @@ func newPluginContext() k8s.PluginContext { }, }, }) + taskExecID.OnGetUniqueNodeID().Return("unique-node") + taskExecID.OnGetGeneratedName().Return("generated-name") tskCtx := &mocks.TaskExecutionMetadata{} tskCtx.OnGetTaskExecutionID().Return(taskExecID) @@ -422,17 +645,19 @@ func TestGetTaskPhase(t *testing.T) { rayJobPhase rayv1alpha1.JobStatus rayClusterPhase rayv1alpha1.JobDeploymentStatus expectedCorePhase pluginsCore.Phase + expectedError bool }{ - {"", rayv1alpha1.JobDeploymentStatusInitializing, pluginsCore.PhaseInitializing}, - {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusFailedToGetOrCreateRayCluster, pluginsCore.PhasePermanentFailure}, - {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusWaitForDashboard, pluginsCore.PhaseRunning}, - {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusFailedJobDeploy, pluginsCore.PhasePermanentFailure}, - {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhaseRunning}, - {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusFailedToGetJobStatus, pluginsCore.PhaseUndefined}, - {rayv1alpha1.JobStatusRunning, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhaseRunning}, - {rayv1alpha1.JobStatusFailed, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhasePermanentFailure}, - {rayv1alpha1.JobStatusSucceeded, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhaseSuccess}, - {rayv1alpha1.JobStatusSucceeded, rayv1alpha1.JobDeploymentStatusComplete, pluginsCore.PhaseSuccess}, + {"", rayv1alpha1.JobDeploymentStatusInitializing, pluginsCore.PhaseInitializing, false}, + {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusFailedToGetOrCreateRayCluster, 
pluginsCore.PhasePermanentFailure, false}, + {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusWaitForDashboard, pluginsCore.PhaseRunning, false}, + {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusFailedJobDeploy, pluginsCore.PhasePermanentFailure, false}, + {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhaseRunning, false}, + {rayv1alpha1.JobStatusPending, rayv1alpha1.JobDeploymentStatusFailedToGetJobStatus, pluginsCore.PhaseRunning, false}, + {rayv1alpha1.JobStatusRunning, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhaseRunning, false}, + {rayv1alpha1.JobStatusFailed, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhasePermanentFailure, false}, + {rayv1alpha1.JobStatusSucceeded, rayv1alpha1.JobDeploymentStatusRunning, pluginsCore.PhaseSuccess, false}, + {rayv1alpha1.JobStatusSucceeded, rayv1alpha1.JobDeploymentStatusComplete, pluginsCore.PhaseSuccess, false}, + {rayv1alpha1.JobStatusStopped, rayv1alpha1.JobDeploymentStatusComplete, pluginsCore.PhaseUndefined, true}, } for _, tc := range testCases { @@ -443,12 +668,69 @@ func TestGetTaskPhase(t *testing.T) { startTime := metav1.NewTime(time.Now()) rayObject.Status.StartTime = &startTime phaseInfo, err := rayJobResourceHandler.GetTaskPhase(ctx, pluginCtx, rayObject) - assert.Nil(t, err) + if tc.expectedError { + assert.Error(t, err) + } else { + assert.Nil(t, err) + } assert.Equal(t, tc.expectedCorePhase.String(), phaseInfo.Phase().String()) }) } } +func TestGetEventInfo_DashboardURL(t *testing.T) { + pluginCtx := newPluginContext() + testCases := []struct { + name string + rayJob rayv1alpha1.RayJob + dashboardURLTemplate tasklog.TemplateLogPlugin + expectedTaskLogs []*core.TaskLog + }{ + { + name: "dashboard URL displayed", + rayJob: rayv1alpha1.RayJob{ + Status: rayv1alpha1.RayJobStatus{ + DashboardURL: "exists", + JobStatus: rayv1alpha1.JobStatusRunning, + }, + }, + dashboardURLTemplate: tasklog.TemplateLogPlugin{ + DisplayName: "Ray 
Dashboard", + TemplateURIs: []tasklog.TemplateURI{"http://test/{{.generatedName}}"}, + Scheme: tasklog.TemplateSchemeTaskExecution, + }, + expectedTaskLogs: []*core.TaskLog{ + { + Name: "Ray Dashboard", + Uri: "http://test/generated-name", + }, + }, + }, + { + name: "dashboard URL is not displayed", + rayJob: rayv1alpha1.RayJob{ + Status: rayv1alpha1.RayJobStatus{ + JobStatus: rayv1alpha1.JobStatusPending, + }, + }, + dashboardURLTemplate: tasklog.TemplateLogPlugin{ + DisplayName: "dummy", + TemplateURIs: []tasklog.TemplateURI{"http://dummy"}, + }, + expectedTaskLogs: nil, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + assert.NoError(t, SetConfig(&Config{DashboardURLTemplate: &tc.dashboardURLTemplate})) + ti, err := getEventInfoForRayJob(logs.LogConfig{}, pluginCtx, &tc.rayJob) + assert.NoError(t, err) + assert.Equal(t, tc.expectedTaskLogs, ti.Logs) + }) + } +} + func TestGetPropertiesRay(t *testing.T) { rayJobResourceHandler := rayJobResourceHandler{} expected := k8s.PluginProperties{} diff --git a/flyteplugins/go/tasks/plugins/k8s/spark/spark.go b/flyteplugins/go/tasks/plugins/k8s/spark/spark.go index d0506ccfb5..e5fd14478a 100644 --- a/flyteplugins/go/tasks/plugins/k8s/spark/spark.go +++ b/flyteplugins/go/tasks/plugins/k8s/spark/spark.go @@ -329,7 +329,7 @@ func getEventInfoForSpark(pluginContext k8s.PluginContext, sj *sparkOp.SparkAppl sparkConfig := GetSparkConfig() taskLogs := make([]*core.TaskLog, 0, 3) - taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID().GetID() + taskExecID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID() if !isQueued { if sj.Status.DriverInfo.PodName != "" { @@ -340,10 +340,10 @@ func getEventInfoForSpark(pluginContext k8s.PluginContext, sj *sparkOp.SparkAppl if p != nil { o, err := p.GetTaskLogs(tasklog.Input{ - PodName: sj.Status.DriverInfo.PodName, - Namespace: sj.Namespace, - LogName: "(Driver Logs)", - TaskExecutionIdentifier: &taskExecID, + PodName: 
sj.Status.DriverInfo.PodName, + Namespace: sj.Namespace, + LogName: "(Driver Logs)", + TaskExecutionID: taskExecID, }) if err != nil { @@ -361,10 +361,10 @@ func getEventInfoForSpark(pluginContext k8s.PluginContext, sj *sparkOp.SparkAppl if p != nil { o, err := p.GetTaskLogs(tasklog.Input{ - PodName: sj.Status.DriverInfo.PodName, - Namespace: sj.Namespace, - LogName: "(User Logs)", - TaskExecutionIdentifier: &taskExecID, + PodName: sj.Status.DriverInfo.PodName, + Namespace: sj.Namespace, + LogName: "(User Logs)", + TaskExecutionID: taskExecID, }) if err != nil { @@ -381,10 +381,10 @@ func getEventInfoForSpark(pluginContext k8s.PluginContext, sj *sparkOp.SparkAppl if p != nil { o, err := p.GetTaskLogs(tasklog.Input{ - PodName: sj.Name, - Namespace: sj.Namespace, - LogName: "(System Logs)", - TaskExecutionIdentifier: &taskExecID, + PodName: sj.Name, + Namespace: sj.Namespace, + LogName: "(System Logs)", + TaskExecutionID: taskExecID, }) if err != nil { @@ -402,10 +402,10 @@ func getEventInfoForSpark(pluginContext k8s.PluginContext, sj *sparkOp.SparkAppl if p != nil { o, err := p.GetTaskLogs(tasklog.Input{ - PodName: sj.Name, - Namespace: sj.Namespace, - LogName: "(Spark-Submit/All User Logs)", - TaskExecutionIdentifier: &taskExecID, + PodName: sj.Name, + Namespace: sj.Namespace, + LogName: "(Spark-Submit/All User Logs)", + TaskExecutionID: taskExecID, }) if err != nil { diff --git a/flyteplugins/go/tasks/plugins/webapi/databricks/integration_test.go b/flyteplugins/go/tasks/plugins/webapi/databricks/integration_test.go index 6a933c6e4f..651892f672 100644 --- a/flyteplugins/go/tasks/plugins/webapi/databricks/integration_test.go +++ b/flyteplugins/go/tasks/plugins/webapi/databricks/integration_test.go @@ -44,7 +44,7 @@ func TestEndToEnd(t *testing.T) { plugin, err := pluginEntry.LoadPlugin(context.TODO(), newFakeSetupContext()) assert.NoError(t, err) - t.Run("run a databricks job", func(t *testing.T) { + t.Run("run a databricks job by new_cluster key", func(t 
*testing.T) { databricksConfDict := map[string]interface{}{ "name": "flytekit databricks plugin example", "new_cluster": map[string]string{ @@ -75,6 +75,34 @@ func TestEndToEnd(t *testing.T) { phase := tests.RunPluginEndToEndTest(t, plugin, &template, inputs, nil, nil, iter) assert.Equal(t, true, phase.Phase().IsSuccess()) }) + + t.Run("run a databricks job by new_cluster key", func(t *testing.T) { + databricksConfDict := map[string]interface{}{ + "name": "flytekit databricks plugin example", + "existing_cluster_id": "1201-my-cluster", + "timeout_seconds": 3600, + "max_retries": 1, + } + databricksConfig, err := utils.MarshalObjToStruct(databricksConfDict) + assert.NoError(t, err) + sparkJob := plugins.SparkJob{DatabricksConf: databricksConfig, DatabricksToken: "token", SparkConf: map[string]string{"spark.driver.bindAddress": "127.0.0.1"}} + st, err := utils.MarshalPbToStruct(&sparkJob) + assert.NoError(t, err) + inputs, _ := coreutils.MakeLiteralMap(map[string]interface{}{"x": 1}) + template := flyteIdlCore.TaskTemplate{ + Type: "databricks", + Custom: st, + Target: &coreIdl.TaskTemplate_Container{ + Container: &coreIdl.Container{ + Command: []string{"command"}, + Args: []string{"pyflyte-execute"}, + }, + }, + } + + phase := tests.RunPluginEndToEndTest(t, plugin, &template, inputs, nil, nil, iter) + assert.Equal(t, true, phase.Phase().IsSuccess()) + }) } func newFakeDatabricksServer() *httptest.Server { diff --git a/flyteplugins/go/tasks/plugins/webapi/databricks/plugin.go b/flyteplugins/go/tasks/plugins/webapi/databricks/plugin.go index 1ce0d747f4..3bd03135dc 100644 --- a/flyteplugins/go/tasks/plugins/webapi/databricks/plugin.go +++ b/flyteplugins/go/tasks/plugins/webapi/databricks/plugin.go @@ -29,7 +29,7 @@ const ( ErrSystem errors.ErrorCode = "System" post string = "POST" get string = "GET" - databricksAPI string = "/api/2.0/jobs/runs" + databricksAPI string = "/api/2.1/jobs/runs" newCluster string = "new_cluster" dockerImage string = "docker_image" 
sparkConfig string = "spark_conf" @@ -114,10 +114,15 @@ func (p Plugin) Create(ctx context.Context, taskCtx webapi.TaskExecutionContextR return nil, nil, fmt.Errorf("failed to unmarshal databricksJob: %v: %v", sparkJob.DatabricksConf, err) } - if _, ok := databricksJob[newCluster]; ok { - databricksJob[newCluster].(map[string]interface{})[dockerImage] = map[string]string{url: container.Image} - if len(sparkJob.SparkConf) != 0 { - databricksJob[newCluster].(map[string]interface{})[sparkConfig] = sparkJob.SparkConf + // If "existing_cluster_id" is in databricks_job, then we don't need to set "new_cluster" + // Refer the docs here: https://docs.databricks.com/en/workflows/jobs/jobs-2.0-api.html#request-structure + if clusterConfig, ok := databricksJob[newCluster].(map[string]interface{}); ok { + if dockerConfig, ok := clusterConfig[dockerImage].(map[string]interface{}); !ok || dockerConfig[url] == nil { + clusterConfig[dockerImage] = map[string]string{url: container.Image} + } + + if clusterConfig[sparkConfig] == nil && len(sparkJob.SparkConf) != 0 { + clusterConfig[sparkConfig] = sparkJob.SparkConf } } databricksJob[sparkPythonTask] = map[string]interface{}{pythonFile: p.cfg.EntrypointFile, parameters: modifiedArgs} @@ -222,7 +227,7 @@ func (p Plugin) Status(ctx context.Context, taskCtx webapi.StatusContext) (phase case http.StatusAccepted: return core.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, taskInfo), nil case http.StatusOK: - if lifeCycleState == "TERMINATED" || lifeCycleState == "SKIPPED" || lifeCycleState == "INTERNAL_ERROR" { + if lifeCycleState == "TERMINATED" || lifeCycleState == "TERMINATING" || lifeCycleState == "INTERNAL_ERROR" { if resultState == "SUCCESS" { if err := writeOutput(ctx, taskCtx); err != nil { pluginsCore.PhaseInfoFailure(string(rune(statusCode)), "failed to write output", taskInfo) @@ -231,6 +236,11 @@ func (p Plugin) Status(ctx context.Context, taskCtx webapi.StatusContext) (phase } return 
pluginsCore.PhaseInfoFailure(string(rune(statusCode)), message, taskInfo), nil } + + if lifeCycleState == "PENDING" { + return core.PhaseInfoInitializing(time.Now(), core.DefaultPhaseVersion, message, taskInfo), nil + } + return core.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, taskInfo), nil case http.StatusBadRequest: fallthrough @@ -278,7 +288,7 @@ func buildRequest( var err error if isCancel { databricksURL += "/cancel" - data = []byte(fmt.Sprintf("{ run_id: %v }", runID)) + data = []byte(fmt.Sprintf("{ \"run_id\": %v }", runID)) } else if method == post { databricksURL += "/submit" mJSON, err := json.Marshal(databricksJob) diff --git a/flyteplugins/go/tasks/plugins/webapi/databricks/plugin_test.go b/flyteplugins/go/tasks/plugins/webapi/databricks/plugin_test.go index 1bafbba223..fda3ab61b0 100644 --- a/flyteplugins/go/tasks/plugins/webapi/databricks/plugin_test.go +++ b/flyteplugins/go/tasks/plugins/webapi/databricks/plugin_test.go @@ -67,7 +67,7 @@ func TestBuildRequest(t *testing.T) { token := "test-token" runID := "019e70eb" databricksEndpoint := "" - databricksURL := "https://" + testInstance + "/api/2.0/jobs/runs" + databricksURL := "https://" + testInstance + "/api/2.1/jobs/runs" t.Run("build http request for submitting a databricks job", func(t *testing.T) { req, err := buildRequest(post, nil, databricksEndpoint, testInstance, token, runID, false) header := http.Header{} diff --git a/flyteplugins/tests/end_to_end.go b/flyteplugins/tests/end_to_end.go index 8fad552b57..037ae877d9 100644 --- a/flyteplugins/tests/end_to_end.go +++ b/flyteplugins/tests/end_to_end.go @@ -82,7 +82,6 @@ func RunPluginEndToEndTest(t *testing.T, executor pluginCore.Plugin, template *i outputWriter.OnGetOutputPath().Return(basePrefix + "/outputs.pb") outputWriter.OnGetCheckpointPrefix().Return("/checkpoint") outputWriter.OnGetPreviousCheckpointsPrefix().Return("/prev") - outputWriter.OnPutMatch(mock.Anything, mock.Anything).Return(nil) outputWriter.OnPutMatch(mock.Anything, 
mock.Anything).Return(nil).Run(func(args mock.Arguments) { or := args.Get(1).(io.OutputReader) @@ -137,6 +136,7 @@ func RunPluginEndToEndTest(t *testing.T, executor pluginCore.Plugin, template *i }, RetryAttempt: 0, }) + tID.OnGetUniqueNodeID().Return("unique-node-id") overrides := &coreMocks.TaskOverrides{} overrides.OnGetConfig().Return(&v1.ConfigMap{Data: map[string]string{ diff --git a/flytepropeller/boilerplate/flyte/golang_test_targets/Makefile b/flytepropeller/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..be72275f5a 100644 --- a/flytepropeller/boilerplate/flyte/golang_test_targets/Makefile +++ b/flytepropeller/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. 
# skips 'vendor' diff --git a/flytepropeller/pkg/controller/executors/execution_context_test.go b/flytepropeller/pkg/controller/executors/execution_context_test.go index 3639057074..6a1561ea7c 100644 --- a/flytepropeller/pkg/controller/executors/execution_context_test.go +++ b/flytepropeller/pkg/controller/executors/execution_context_test.go @@ -63,3 +63,32 @@ func TestExecutionContext(t *testing.T) { assert.Equal(t, typed.TaskDetailsGetter, taskGetter) assert.Equal(t, typed.GetParentInfo(), immutableParentInfo2) } + +func TestParentExecutionInfo_GetUniqueID(t *testing.T) { + expectedID := "testID" + parentInfo := NewParentInfo(expectedID, 1) + assert.Equal(t, expectedID, parentInfo.GetUniqueID()) +} + +func TestParentExecutionInfo_CurrentAttempt(t *testing.T) { + expectedAttempt := uint32(123465) + parentInfo := NewParentInfo("testID", expectedAttempt) + assert.Equal(t, expectedAttempt, parentInfo.CurrentAttempt()) +} + +func TestControlFlow_ControlFlowParallelism(t *testing.T) { + cFlow := InitializeControlFlow().(*controlFlow) + assert.Equal(t, uint32(0), cFlow.CurrentParallelism()) + cFlow.IncrementParallelism() + assert.Equal(t, uint32(1), cFlow.CurrentParallelism()) + cFlow.IncrementParallelism() + assert.Equal(t, uint32(2), cFlow.CurrentParallelism()) +} + +func TestNewParentInfo(t *testing.T) { + expectedID := "testID" + expectedAttempt := uint32(123465) + parentInfo := NewParentInfo(expectedID, expectedAttempt).(*parentExecutionInfo) + assert.Equal(t, expectedID, parentInfo.uniqueID) + assert.Equal(t, expectedAttempt, parentInfo.currentAttempts) +} diff --git a/flytepropeller/pkg/controller/executors/node_lookup_test.go b/flytepropeller/pkg/controller/executors/node_lookup_test.go index 889ea62236..885c3262c4 100644 --- a/flytepropeller/pkg/controller/executors/node_lookup_test.go +++ b/flytepropeller/pkg/controller/executors/node_lookup_test.go @@ -48,3 +48,18 @@ func TestNewTestNodeLookup(t *testing.T) { assert.False(t, ok) assert.NotEqual(t, ns, 
nl.GetNodeExecutionStatus(context.TODO(), "n")) } + +func TestToNodeAndFromNode(t *testing.T) { + n := &mocks.ExecutableNode{} + ns := &mocks.ExecutableNodeStatus{} + nl := NewTestNodeLookup(map[string]v1alpha1.ExecutableNode{"n1": n}, map[string]v1alpha1.ExecutableNodeStatus{"n1": ns}) + + nodeIds, err := nl.ToNode("n1") + assert.Nil(t, nodeIds) + assert.Nil(t, err) + + nodeIds, err = nl.FromNode("n1") + assert.Nil(t, nodeIds) + assert.Nil(t, err) + +} diff --git a/flytepropeller/pkg/controller/nodes/array/event_recorder.go b/flytepropeller/pkg/controller/nodes/array/event_recorder.go index 55cbbce89f..c4e3004011 100644 --- a/flytepropeller/pkg/controller/nodes/array/event_recorder.go +++ b/flytepropeller/pkg/controller/nodes/array/event_recorder.go @@ -120,11 +120,39 @@ func (e *externalResourcesEventRecorder) finalize(ctx context.Context, nCtx inte OccurredAt: occurredAt, Metadata: &event.TaskExecutionMetadata{ ExternalResources: e.externalResources, + PluginIdentifier: "container", }, TaskType: "k8s-array", EventVersion: 1, } + // only attach input values if taskPhase is QUEUED meaning this the first evaluation + if taskPhase == idlcore.TaskExecution_QUEUED { + if eventConfig.RawOutputPolicy == config.RawOutputPolicyInline { + // pass inputs by value + literalMap, err := nCtx.InputReader().Get(ctx) + if err != nil { + return err + } + + taskExecutionEvent.InputValue = &event.TaskExecutionEvent_InputData{ + InputData: literalMap, + } + } else { + // pass inputs by reference + taskExecutionEvent.InputValue = &event.TaskExecutionEvent_InputUri{ + InputUri: nCtx.InputReader().GetInputPath().String(), + } + } + } + + // only attach output uri if taskPhase is SUCCEEDED + if taskPhase == idlcore.TaskExecution_SUCCEEDED { + taskExecutionEvent.OutputResult = &event.TaskExecutionEvent_OutputUri{ + OutputUri: v1alpha1.GetOutputsFile(nCtx.NodeStatus().GetOutputDir()).String(), + } + } + // record TaskExecutionEvent return e.EventRecorder.RecordTaskEvent(ctx, 
taskExecutionEvent, eventConfig) } diff --git a/flytepropeller/pkg/controller/nodes/array/handler.go b/flytepropeller/pkg/controller/nodes/array/handler.go index 35076d0a97..dddcd0e7c5 100644 --- a/flytepropeller/pkg/controller/nodes/array/handler.go +++ b/flytepropeller/pkg/controller/nodes/array/handler.go @@ -494,7 +494,12 @@ func (a *arrayNodeHandler) buildArrayNodeContext(ctx context.Context, nCtx inter taskPhase := int(arrayNodeState.SubNodeTaskPhases.GetItem(subNodeIndex)) // need to initialize the inputReader every time to ensure TaskHandler can access for cache lookups / population - inputLiteralMap, err := constructLiteralMap(ctx, nCtx.InputReader(), subNodeIndex) + inputs, err := nCtx.InputReader().Get(ctx) + if err != nil { + return nil, nil, nil, nil, nil, nil, err + } + + inputLiteralMap, err := constructLiteralMap(inputs, subNodeIndex) if err != nil { return nil, nil, nil, nil, nil, nil, err } diff --git a/flytepropeller/pkg/controller/nodes/array/node_execution_context.go b/flytepropeller/pkg/controller/nodes/array/node_execution_context.go index c639c543e7..17d46d2944 100644 --- a/flytepropeller/pkg/controller/nodes/array/node_execution_context.go +++ b/flytepropeller/pkg/controller/nodes/array/node_execution_context.go @@ -2,6 +2,7 @@ package array import ( "context" + "fmt" "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/io" @@ -26,16 +27,16 @@ func newStaticInputReader(inputPaths io.InputFilePaths, input *core.LiteralMap) } } -func constructLiteralMap(ctx context.Context, inputReader io.InputReader, index int) (*core.LiteralMap, error) { - inputs, err := inputReader.Get(ctx) - if err != nil { - return nil, err - } - +func constructLiteralMap(inputs *core.LiteralMap, index int) (*core.LiteralMap, error) { literals := make(map[string]*core.Literal) for name, literal := range inputs.Literals { if literalCollection := literal.GetCollection(); literalCollection != nil { + 
if index >= len(literalCollection.Literals) { + return nil, fmt.Errorf("index %v out of bounds for literal collection %v", index, name) + } literals[name] = literalCollection.Literals[index] + } else { + literals[name] = literal } } diff --git a/flytepropeller/pkg/controller/nodes/array/node_execution_context_test.go b/flytepropeller/pkg/controller/nodes/array/node_execution_context_test.go new file mode 100644 index 0000000000..0a5546bd25 --- /dev/null +++ b/flytepropeller/pkg/controller/nodes/array/node_execution_context_test.go @@ -0,0 +1,160 @@ +package array + +import ( + "testing" + + "github.com/golang/protobuf/proto" + "github.com/stretchr/testify/assert" + + "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core" +) + +var ( + literalOne = &core.Literal{ + Value: &core.Literal_Scalar{ + Scalar: &core.Scalar{ + Value: &core.Scalar_Primitive{ + Primitive: &core.Primitive{ + Value: &core.Primitive_Integer{ + Integer: 1, + }, + }, + }, + }, + }, + } + literalTwo = &core.Literal{ + Value: &core.Literal_Scalar{ + Scalar: &core.Scalar{ + Value: &core.Scalar_Primitive{ + Primitive: &core.Primitive{ + Value: &core.Primitive_Integer{ + Integer: 2, + }, + }, + }, + }, + }, + } +) + +func TestConstructLiteralMap(t *testing.T) { + tests := []struct { + name string + inputLiteralMaps *core.LiteralMap + expectedLiteralMaps []*core.LiteralMap + }{ + { + "SingleList", + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": &core.Literal{ + Value: &core.Literal_Collection{ + Collection: &core.LiteralCollection{ + Literals: []*core.Literal{ + literalOne, + literalTwo, + }, + }, + }, + }, + }, + }, + []*core.LiteralMap{ + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": literalOne, + }, + }, + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": literalTwo, + }, + }, + }, + }, + { + "MultiList", + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": &core.Literal{ + Value: &core.Literal_Collection{ + Collection: 
&core.LiteralCollection{ + Literals: []*core.Literal{ + literalOne, + literalTwo, + }, + }, + }, + }, + "bar": &core.Literal{ + Value: &core.Literal_Collection{ + Collection: &core.LiteralCollection{ + Literals: []*core.Literal{ + literalTwo, + literalOne, + }, + }, + }, + }, + }, + }, + []*core.LiteralMap{ + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": literalOne, + "bar": literalTwo, + }, + }, + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": literalTwo, + "bar": literalOne, + }, + }, + }, + }, + { + "Partial", + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": &core.Literal{ + Value: &core.Literal_Collection{ + Collection: &core.LiteralCollection{ + Literals: []*core.Literal{ + literalOne, + literalTwo, + }, + }, + }, + }, + "bar": literalTwo, + }, + }, + []*core.LiteralMap{ + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": literalOne, + "bar": literalTwo, + }, + }, + &core.LiteralMap{ + Literals: map[string]*core.Literal{ + "foo": literalTwo, + "bar": literalTwo, + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + for i := 0; i < len(test.expectedLiteralMaps); i++ { + outputLiteralMap, err := constructLiteralMap(test.inputLiteralMaps, i) + assert.NoError(t, err) + assert.True(t, proto.Equal(test.expectedLiteralMaps[i], outputLiteralMap)) + } + }) + } +} diff --git a/flytepropeller/pkg/controller/nodes/task/handler.go b/flytepropeller/pkg/controller/nodes/task/handler.go index 6a0f44d648..64bf012b80 100644 --- a/flytepropeller/pkg/controller/nodes/task/handler.go +++ b/flytepropeller/pkg/controller/nodes/task/handler.go @@ -570,6 +570,12 @@ func (t Handler) Handle(ctx context.Context, nCtx interfaces.NodeExecutionContex } if pluginTrns.IsPreviouslyObserved() { logger.Debugf(ctx, "No state change for Task, previously observed same transition. 
Short circuiting.") + logger.Infof(ctx, "Parallelism now set to [%d].", nCtx.ExecutionContext().IncrementParallelism()) + + // This is a hack to ensure that we do not re-evaluate the same node again in the same round. + if err := nCtx.NodeStateWriter().PutTaskNodeState(ts); err != nil { + return handler.UnknownTransition, err + } return pluginTrns.FinalTransition(ctx) } } diff --git a/flytepropeller/pkg/controller/nodes/task/handler_test.go b/flytepropeller/pkg/controller/nodes/task/handler_test.go index 0e778a5e48..91c0879759 100644 --- a/flytepropeller/pkg/controller/nodes/task/handler_test.go +++ b/flytepropeller/pkg/controller/nodes/task/handler_test.go @@ -530,13 +530,12 @@ func Test_task_Handle_NoCatalog(t *testing.T) { expectedState fakeplugins.NextPhaseState } type want struct { - handlerPhase handler.EPhase - wantErr bool - event bool - eventPhase core.TaskExecution_Phase - skipStateUpdate bool - incrParallel bool - checkpoint bool + handlerPhase handler.EPhase + wantErr bool + event bool + eventPhase core.TaskExecution_Phase + incrParallel bool + checkpoint bool } tests := []struct { name string @@ -666,10 +665,9 @@ func Test_task_Handle_NoCatalog(t *testing.T) { }, }, want{ - handlerPhase: handler.EPhaseRunning, - event: false, - skipStateUpdate: true, - incrParallel: true, + handlerPhase: handler.EPhaseRunning, + event: false, + incrParallel: true, }, }, { @@ -738,13 +736,8 @@ func Test_task_Handle_NoCatalog(t *testing.T) { expectedPhase = pluginCore.PhasePermanentFailure } } - if tt.want.skipStateUpdate { - assert.Equal(t, pluginCore.PhaseUndefined, state.s.PluginPhase) - assert.Equal(t, uint32(0), state.s.PluginPhaseVersion) - } else { - assert.Equal(t, expectedPhase.String(), state.s.PluginPhase.String()) - assert.Equal(t, tt.args.expectedState.PhaseVersion, state.s.PluginPhaseVersion) - } + assert.Equal(t, expectedPhase.String(), state.s.PluginPhase.String()) + assert.Equal(t, tt.args.expectedState.PhaseVersion, state.s.PluginPhaseVersion) if 
tt.want.checkpoint { assert.Equal(t, "s3://sandbox/x/name-n1-1/_flytecheckpoints", got.Info().GetInfo().TaskNodeInfo.TaskNodeMetadata.CheckpointUri) diff --git a/flytepropeller/pkg/controller/nodes/task/taskexec_context.go b/flytepropeller/pkg/controller/nodes/task/taskexec_context.go index 8b819c79eb..4a6f9750a2 100644 --- a/flytepropeller/pkg/controller/nodes/task/taskexec_context.go +++ b/flytepropeller/pkg/controller/nodes/task/taskexec_context.go @@ -33,14 +33,19 @@ var ( const IDMaxLength = 50 type taskExecutionID struct { - execName string - id *core.TaskExecutionIdentifier + execName string + id *core.TaskExecutionIdentifier + uniqueNodeID string } func (te taskExecutionID) GetID() core.TaskExecutionIdentifier { return *te.id } +func (te taskExecutionID) GetUniqueNodeID() string { + return te.uniqueNodeID +} + func (te taskExecutionID) GetGeneratedName() string { return te.execName } @@ -291,11 +296,15 @@ func (t *Handler) newTaskExecutionContext(ctx context.Context, nCtx interfaces.N NodeExecutionContext: nCtx, tm: taskExecutionMetadata{ NodeExecutionMetadata: nCtx.NodeExecutionMetadata(), - taskExecID: taskExecutionID{execName: uniqueID, id: id}, - o: nCtx.Node(), - maxAttempts: maxAttempts, - platformResources: convertTaskResourcesToRequirements(nCtx.ExecutionContext().GetExecutionConfig().TaskResources), - environmentVariables: nCtx.ExecutionContext().GetExecutionConfig().EnvironmentVariables, + taskExecID: taskExecutionID{ + execName: uniqueID, + id: id, + uniqueNodeID: currentNodeUniqueID, + }, + o: nCtx.Node(), + maxAttempts: maxAttempts, + platformResources: convertTaskResourcesToRequirements(nCtx.ExecutionContext().GetExecutionConfig().TaskResources), + environmentVariables: nCtx.ExecutionContext().GetExecutionConfig().EnvironmentVariables, }, rm: resourcemanager.GetTaskResourceManager( t.resourceManager, resourceNamespacePrefix, id), diff --git a/flytepropeller/pkg/controller/nodes/task/taskexec_context_test.go 
b/flytepropeller/pkg/controller/nodes/task/taskexec_context_test.go index a8cbc7d6d3..b0106e8ff9 100644 --- a/flytepropeller/pkg/controller/nodes/task/taskexec_context_test.go +++ b/flytepropeller/pkg/controller/nodes/task/taskexec_context_test.go @@ -20,8 +20,10 @@ import ( "github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/ioutils" "github.com/flyteorg/flyte/flytepropeller/pkg/apis/flyteworkflow/v1alpha1" flyteMocks "github.com/flyteorg/flyte/flytepropeller/pkg/apis/flyteworkflow/v1alpha1/mocks" + "github.com/flyteorg/flyte/flytepropeller/pkg/controller/executors" mocks2 "github.com/flyteorg/flyte/flytepropeller/pkg/controller/executors/mocks" "github.com/flyteorg/flyte/flytepropeller/pkg/controller/nodes/handler" + "github.com/flyteorg/flyte/flytepropeller/pkg/controller/nodes/interfaces" nodeMocks "github.com/flyteorg/flyte/flytepropeller/pkg/controller/nodes/interfaces/mocks" "github.com/flyteorg/flyte/flytepropeller/pkg/controller/nodes/task/codex" "github.com/flyteorg/flyte/flytepropeller/pkg/controller/nodes/task/resourcemanager" @@ -31,15 +33,26 @@ import ( "github.com/flyteorg/flyte/flytestdlib/storage" ) -func TestHandler_newTaskExecutionContext(t *testing.T) { - wfExecID := &core.WorkflowExecutionIdentifier{ +type dummyPluginState struct { + A int +} + +var ( + wfExecID = &core.WorkflowExecutionIdentifier{ Project: "project", Domain: "domain", Name: "name", } + taskID = &core.Identifier{} + nodeID = "n1" + resources = &corev1.ResourceRequirements{ + Requests: make(corev1.ResourceList), + Limits: make(corev1.ResourceList), + } + dummyPluginStateA = 45 +) - nodeID := "n1" - +func dummyNodeExecutionContext(t *testing.T, parentInfo executors.ImmutableParentInfo, eventVersion v1alpha1.EventVersion) interfaces.NodeExecutionContext { nm := &nodeMocks.NodeExecutionMetadata{} nm.OnGetAnnotations().Return(map[string]string{}) nm.OnGetNodeExecutionID().Return(&core.NodeExecutionIdentifier{ @@ -55,7 +68,6 @@ func TestHandler_newTaskExecutionContext(t 
*testing.T) { Name: "name", }) - taskID := &core.Identifier{} tr := &nodeMocks.TaskReader{} tr.OnGetTaskID().Return(taskID) @@ -63,12 +75,8 @@ func TestHandler_newTaskExecutionContext(t *testing.T) { ns.OnGetDataDir().Return("data-dir") ns.OnGetOutputDir().Return("output-dir") - res := &corev1.ResourceRequirements{ - Requests: make(corev1.ResourceList), - Limits: make(corev1.ResourceList), - } n := &flyteMocks.ExecutableNode{} - n.OnGetResources().Return(res) + n.OnGetResources().Return(resources) ma := 5 n.OnGetRetryStrategy().Return(&v1alpha1.RetryStrategy{MinAttempts: &ma}) @@ -87,8 +95,8 @@ func TestHandler_newTaskExecutionContext(t *testing.T) { executionContext := &mocks2.ExecutionContext{} executionContext.OnGetExecutionConfig().Return(v1alpha1.ExecutionConfig{}) - executionContext.OnGetParentInfo().Return(nil) - executionContext.OnGetEventVersion().Return(v1alpha1.EventVersion0) + executionContext.OnGetParentInfo().Return(parentInfo) + executionContext.OnGetEventVersion().Return(eventVersion) nCtx.OnExecutionContext().Return(executionContext) ds, err := storage.NewDataStore( @@ -101,12 +109,8 @@ func TestHandler_newTaskExecutionContext(t *testing.T) { nCtx.OnDataStore().Return(ds) st := bytes.NewBuffer([]byte{}) - a := 45 - type test struct { - A int - } codex := codex.GobStateCodec{} - assert.NoError(t, codex.Encode(test{A: a}, st)) + assert.NoError(t, codex.Encode(dummyPluginState{A: dummyPluginStateA}, st)) nr := &nodeMocks.NodeStateReader{} nr.OnGetTaskNodeState().Return(handler.TaskNodeState{ PluginState: st.Bytes(), @@ -114,28 +118,39 @@ func TestHandler_newTaskExecutionContext(t *testing.T) { nCtx.OnNodeStateReader().Return(nr) nCtx.OnRawOutputPrefix().Return("s3://sandbox/") nCtx.OnOutputShardSelector().Return(ioutils.NewConstantShardSelector([]string{"x"})) + return nCtx +} - noopRm := CreateNoopResourceManager(context.TODO(), promutils.NewTestScope()) +func dummyPlugin() pluginCore.Plugin { + p := &pluginCoreMocks.Plugin{} + 
p.On("GetID").Return("plugin1") + p.OnGetProperties().Return(pluginCore.PluginProperties{}) + return p +} +func dummyHandler() *Handler { + noopRm := CreateNoopResourceManager(context.TODO(), promutils.NewTestScope()) c := &mocks.Client{} - tk := &Handler{ + return &Handler{ catalog: c, secretManager: secretmanager.NewFileEnvSecretManager(secretmanager.GetConfig()), resourceManager: noopRm, } +} - p := &pluginCoreMocks.Plugin{} - p.On("GetID").Return("plugin1") - p.OnGetProperties().Return(pluginCore.PluginProperties{}) +func TestHandler_newTaskExecutionContext(t *testing.T) { + nCtx := dummyNodeExecutionContext(t, nil, v1alpha1.EventVersion0) + p := dummyPlugin() + tk := dummyHandler() got, err := tk.newTaskExecutionContext(context.TODO(), nCtx, p) assert.NoError(t, err) assert.NotNil(t, got) - f := &test{} + f := &dummyPluginState{} v, err := got.PluginStateReader().Get(f) assert.NoError(t, err) assert.Equal(t, v, uint8(0)) - assert.Equal(t, f.A, a) + assert.Equal(t, f.A, dummyPluginStateA) // Try writing new state type test2 struct { @@ -151,13 +166,14 @@ func TestHandler_newTaskExecutionContext(t *testing.T) { assert.NotNil(t, got.SecretManager()) assert.NotNil(t, got.OutputWriter()) - assert.Equal(t, got.TaskExecutionMetadata().GetOverrides().GetResources(), res) + assert.Equal(t, got.TaskExecutionMetadata().GetOverrides().GetResources(), resources) assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), "name-n1-1") assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetID().TaskId, taskID) assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetID().RetryAttempt, uint32(1)) assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetID().NodeExecutionId.GetNodeId(), nodeID) assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetID().NodeExecutionId.GetExecutionId(), wfExecID) + assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetUniqueNodeID(), nodeID) 
assert.EqualValues(t, got.ResourceManager().(resourcemanager.TaskResourceManager).GetResourcePoolInfo(), make([]*event.ResourcePoolInfo, 0)) @@ -195,6 +211,22 @@ func TestHandler_newTaskExecutionContext(t *testing.T) { assert.NotNil(t, anotherTaskExecCtx.tr) } +func TestHandler_newTaskExecutionContext_taskExecutionID_WithParentInfo(t *testing.T) { + parentInfo := &mocks2.ImmutableParentInfo{} + parentInfo.OnGetUniqueID().Return("n0") + parentInfo.OnCurrentAttempt().Return(uint32(2)) + + nCtx := dummyNodeExecutionContext(t, parentInfo, v1alpha1.EventVersion1) + p := dummyPlugin() + tk := dummyHandler() + got, err := tk.newTaskExecutionContext(context.TODO(), nCtx, p) + assert.NoError(t, err) + assert.NotNil(t, got) + + assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), "name-n0-2-n1-1") + assert.Equal(t, got.TaskExecutionMetadata().GetTaskExecutionID().GetUniqueNodeID(), "n0-2-n1") +} + func TestGetGeneratedNameWith(t *testing.T) { t.Run("length 0", func(t *testing.T) { tCtx := taskExecutionID{ diff --git a/flytepropeller/pkg/controller/nodes/task/transformer_test.go b/flytepropeller/pkg/controller/nodes/task/transformer_test.go index a26705baee..db89dda3e6 100644 --- a/flytepropeller/pkg/controller/nodes/task/transformer_test.go +++ b/flytepropeller/pkg/controller/nodes/task/transformer_test.go @@ -67,6 +67,7 @@ func TestToTaskExecutionEvent(t *testing.T) { generatedName := "generated_name" tID.OnGetGeneratedName().Return(generatedName) tID.OnGetID().Return(*id) + tID.OnGetUniqueNodeID().Return("unique-node-id") tMeta := &pluginMocks.TaskExecutionMetadata{} tMeta.OnGetTaskExecutionID().Return(tID) @@ -261,6 +262,7 @@ func TestToTaskExecutionEventWithParent(t *testing.T) { generatedName := "generated_name" tID.OnGetGeneratedName().Return(generatedName) tID.OnGetID().Return(*id) + tID.OnGetUniqueNodeID().Return("unique-node-id") tMeta := &pluginMocks.TaskExecutionMetadata{} tMeta.OnGetTaskExecutionID().Return(tID) diff --git 
a/flytestdlib/boilerplate/flyte/golang_test_targets/Makefile b/flytestdlib/boilerplate/flyte/golang_test_targets/Makefile index 280e1e55e4..be72275f5a 100644 --- a/flytestdlib/boilerplate/flyte/golang_test_targets/Makefile +++ b/flytestdlib/boilerplate/flyte/golang_test_targets/Makefile @@ -14,7 +14,7 @@ generate: download_tooling #generate go code .PHONY: lint lint: download_tooling #lints the package for common code smells - GL_DEBUG=linters_output,env golangci-lint run --deadline=5m --exclude deprecated -v + GL_DEBUG=linters_output,env golangci-lint run --fix --deadline=5m --exclude deprecated -v # If code is failing goimports linter, this will fix. # skips 'vendor' diff --git a/flytestdlib/storage/stow_store.go b/flytestdlib/storage/stow_store.go index ddda2ce762..701604d437 100644 --- a/flytestdlib/storage/stow_store.go +++ b/flytestdlib/storage/stow_store.go @@ -265,8 +265,8 @@ func (s *StowStore) ReadRaw(ctx context.Context, reference DataReference) (io.Re } if GetConfig().Limits.GetLimitMegabytes != 0 { - if sizeMbs := sizeBytes / MiB; sizeMbs > GetConfig().Limits.GetLimitMegabytes { - return nil, errors.Errorf(ErrExceedsLimit, "limit exceeded. %vmb > %vmb.", sizeMbs, GetConfig().Limits.GetLimitMegabytes) + if sizeBytes > GetConfig().Limits.GetLimitMegabytes*MiB { + return nil, errors.Errorf(ErrExceedsLimit, "limit exceeded. 
%.6fmb > %vmb.", float64(sizeBytes)/float64(MiB), GetConfig().Limits.GetLimitMegabytes) } } diff --git a/flytestdlib/storage/stow_store_test.go b/flytestdlib/storage/stow_store_test.go index 367145ac93..d888977c57 100644 --- a/flytestdlib/storage/stow_store_test.go +++ b/flytestdlib/storage/stow_store_test.go @@ -274,7 +274,7 @@ func TestStowStore_ReadRaw(t *testing.T) { }, }, nil, false, metrics) assert.NoError(t, err) - dataReference := writeTestFileWithSize(ctx, t, s, "s3://container/path", 3*MiB) + dataReference := writeTestFileWithSize(ctx, t, s, "s3://container/path", 2*MiB+1) _, err = s.ReadRaw(ctx, dataReference) assert.Error(t, err) assert.True(t, IsExceedsLimit(err)) diff --git a/go.mod b/go.mod index 7b0855bb1f..c2b49c7f95 100644 --- a/go.mod +++ b/go.mod @@ -3,10 +3,10 @@ module github.com/flyteorg/flyte go 1.19 require ( - github.com/flyteorg/flyte/datacatalog v1.9.37 - github.com/flyteorg/flyte/flyteadmin v1.9.37 - github.com/flyteorg/flyte/flytepropeller v1.9.37 - github.com/flyteorg/flyte/flytestdlib v1.9.37 + github.com/flyteorg/flyte/datacatalog v0.0.0-00010101000000-000000000000 + github.com/flyteorg/flyte/flyteadmin v0.0.0-00010101000000-000000000000 + github.com/flyteorg/flyte/flytepropeller v0.0.0-00010101000000-000000000000 + github.com/flyteorg/flyte/flytestdlib v0.0.0-00010101000000-000000000000 github.com/golang/glog v1.1.0 github.com/prometheus/client_golang v1.16.0 github.com/spf13/cobra v1.7.0 diff --git a/rfc/README.md b/rfc/README.md index 0490072aa0..1c451d5d29 100644 --- a/rfc/README.md +++ b/rfc/README.md @@ -2,105 +2,119 @@ >*"To think, you have to write. If you're thinking without writing, you only think you're thinking"*. 
Leslie Lamport +## Table of Contents +- [Introduction](#introduction) +- [When to Follow this Process](#when-to-follow-this-process) +- [Before Creating an RFC](#before-creating-an-rfc) +- [Where to Store RFCs](#where-to-store-rfcs) +- [The Process](#the-process) + - [Opening a Pull Request](#opening-a-pull-request) + - [Acceptance Criteria](#acceptance-criteria) +- [Implementing RFCs](#implementing-rfcs) +- [RFC Postponement](#rfc-postponement) +- [Contributions](#contributions) +- [Prior Art](#prior-art) + +## Introduction + An RFC - Request For Comments - is a document issued mainly to recommend changes to the language and standard libraries. The RFC process aims to provide a consistent and controlled path for changes to enter the project, ensuring that all stakeholders can review and approve them before they are implemented. -## When you need to follow this process -[When you need to follow this process]: #when-you-need-to-follow-this-process +## When to Follow this Process + You need to follow this process if you intend to make a significant change to the system or changes to the user experience. What constitutes a "significant" change is evolving based on community feedback, but may include the following: - - Semantics or syntax changes, other than bugfixes (e.g. introducing a new Flytekit language feature) - - Core-Language updates (e.g. FlyteIdl changes that introduce a new wire-format) - - Documentation restructuring - - Service-to-Service changes (e.g. new gRPC APIs) - - Breaking changes - - Governance changes +- Semantics or syntax changes, other than bugfixes (e.g. introducing a new Flytekit language feature) +- Core-Language updates (e.g. FlyteIdl changes that introduce a new wire-format) +- Documentation restructuring +- Service-to-Service changes (e.g. new gRPC APIs) +- Breaking changes +- Governance changes Changes not requiring RFCs include: - - Documentation rephrasing or reorganizing - - Addition of images, tables, etc. 
to supplement documentation - - Updates to code samples, links, examples, etc. - - Bug fixes that do not introduce changes listed above (use your judgement). +- Documentation rephrasing or reorganizing +- Addition of images, tables, etc. to supplement documentation +- Updates to code samples, links, examples, etc. +- Bug fixes that do not introduce changes listed above (use your judgment). -**NOTE:** new feature implementation will almost certainly require an RFC; PRs attempting to do so will not be approved without a merged RFC. +**NOTE:** New feature implementation will almost certainly require an RFC; PRs attempting to do so will not be approved without a merged RFC. -## Before creating an RFC -[Before creating an RFC]: #before-creating-an-rfc +## Before Creating an RFC Some good practices when starting an RFC include: - Gathering feedback from team or community members first, to confirm that the changes will indeed be useful - Starting a Discussion at the [RFC Incubator](https://github.com/flyteorg/flyte/discussions/new?category=rfc-incubator) to gauge interest. 
Once you have received positive feedback, especially from Maintainers or Steering Committee members, please proceed to: - - Ask in the Discussion for confirmation to submit an RFC - - If there's no objection (silence is approval) then create an Issue from the Discussion ([see how](https://docs.github.com/en/issues/tracking-your-work-with-issues/creating-an-issue#creating-an-issue-from-discussion)) - - Proceed to [open a PR](#opening-a-pull-request) + - Ask in the Discussion for confirmation to submit an RFC + - If there's no objection (silence is approval) then create an Issue from the Discussion ([see how](https://docs.github.com/en/issues/tracking-your-work-with-issues/creating-an-issue#creating-an-issue-from-discussion)) + - Proceed to [open a PR](#opening-a-pull-request) - Discussing the topic on the [#contribute](https://flyte-org.slack.com/archives/C04NJPLRWUX) Slack channel - Adding the topic to the Contributor's [meeting agenda](https://hackmd.io/@davidmirror/rkqCpbK1n) to make sure it aligns with roadmap - Taking the time to produce a well-written, well-thought-of document by using the template located [here](https://github.com/flyteorg/flyte/blob/RFC-Process/rfc/RFC-0000-Template.md). -## Where to store RFCs +## Where to Store RFCs The Flyte repo on GitHub has an RFC folder with 3 directories: -- **Core language:** proposals to `FlyteIdl` that change the wire-format in any way are considered significant changes that require revision and approval. + +- **Core language:** Proposals to `FlyteIdl` that change the wire-format in any way are considered significant changes that require revision and approval. Reviewers: At least one of [Flyte maintainers](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md) and one of [Technical Steering Committee](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md). -- **General System:** Changes to other repos that introduce significant change of behavior or user-impacting features. 
+ +- **General System:** Changes to other repos that introduce a significant change of behavior or user-impacting features. Reviewers: At least one of [Flyte maintainers](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md) and one of [Technical Steering Committee](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md). -- **CI-CD:** Significant changes to CI-CD System that have impact across different repositories. + +- **CI-CD:** Significant changes to CI-CD System that have an impact across different repositories. Reviewer: At least one of [Flyte maintainers](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md). -## The process -![](RFC-Process-diagram-v2.png) +## The Process + ### Opening a Pull Request -* Fork the [flyte repository](https://github.com/flyteorg/flyte). -* Copy `rfc/RFC-0000-Template.md` to `rfc//0000-my-feature.md` (where "my-feature" is descriptive). Don't assign an RFC number yet; this is going to be the PR number and we'll rename the file accordingly if the RFC is accepted. -* Fill in the RFC. Put care into the details. -* Submit a pull request. As a pull request the RFC will receive design feedback from the larger community, and the author should be prepared to revise it in response. -* Now that your RFC has an open pull request, use the pull request number (e.g. `123` from `https://github.com/flyteorg/flyte/pull/123`) to update your 0000- prefix to that number. +- Fork the [flyte repository](https://github.com/flyteorg/flyte). +- Copy `rfc/RFC-0000-Template.md` to `rfc//0000-my-feature.md` (where "my-feature" is descriptive). Don't assign an RFC number yet; this is going to be the PR number and we'll rename the file accordingly if the RFC is accepted. +- Fill in the RFC. Put care into the details. +- Submit a pull request. As a pull request, the RFC will receive design feedback from the larger community, and the author should be prepared to revise it in response. 
+- Now that your RFC has an open pull request, use the pull request number (e.g. `123` from `https://github.com/flyteorg/flyte/pull/123`) to update your 0000- prefix to that number. Once a pull request is opened, the RFC is now in development and the following will happen: -* It will be introduced in a future Contributor's meetup, happening every other week, except otherwise informed. -* The proposal will be discussed as much as possible in the RFC pull request directly. Any outside discussion will be summarized in the comment thread. -* When deemed "ready", a maintainer or TSC member will propose a "motion for Final Comment Period (FCP)" along with a disposition of the outcome (merge, close, or postpone). This step is taken when enough discussions of the tradeoffs have taken place and the community is in a position to make a decision. -* The proposal enters FCP unless there's any objection (lazy consensus). -* The Final Comment Period will last 7 days. If there's no objection, the FCP can close early. -* If no substantial new arguments or ideas are raised, the FCP will follow the outcome decided. -* The RFC author could also withdraw the proposal if they decide to accept a competing proposal as a better alternative or if they deem the original idea as not applicable anymore. +- It will be introduced in a future Contributor's meetup, happening every other week, except otherwise informed. +- The proposal will be discussed as much as possible in the RFC pull request directly. Any outside discussion will be summarized in the comment thread. +- When deemed "ready," a maintainer or TSC member will propose a "motion for Final Comment Period (FCP)" along with a disposition of the outcome (merge, close, or postpone). This step is taken when enough discussions of the tradeoffs have taken place and the community is in a position to make a decision. +- The proposal enters FCP unless there's any objection (lazy consensus). +- The Final Comment Period will last 7 days. 
If there's no objection, the FCP can close early. +- If no substantial new arguments or ideas are raised, the FCP will follow the outcome decided. +- The RFC author could also withdraw the proposal if they decide to accept a competing proposal as a better alternative or if they deem the original idea as not applicable anymore. -### Acceptance criteria +### Acceptance Criteria A proposal is considered Accepted when it has: -* Completed the FCP with no significant objections -* Received an approval vote from a supermajority (2/3) of the [Technical Steering Committee](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md)'s members - +- Completed the FCP with no significant objections +- Received an approval vote from a supermajority (2/3) of the [Technical Steering Committee](https://github.com/flyteorg/community/blob/main/MAINTAINERS.md)'s members ## Implementing RFCs -RFCs for vital features may need to be implemented immediately, whereas other RFCs may need to wait for the right developer to be available. - -The author of an RFC may not always be the one to implement it, but if they do, they are welcome to submit an implementation for review after the RFC has been accepted. +RFCs for vital features may need to be implemented immediately, whereas other RFCs may need to wait for the right developer to be available. The author of an RFC may not always be the one to implement it, but if they do, they are welcome to submit an implementation for review after the RFC has been accepted. ## RFC Postponement -Rejected RFCs may be labelled as "postponed". This means the PR has been closed without implementation, because either: +Rejected RFCs may be labeled as "postponed." This means the PR has been closed without implementation, because either: - the proposed change will need to wait for prerequisite features, - the business model may not be presently able to support it, - or other more urgent changes may need to be implemented first. 
Although "postponed" RFCs have already gone through the approval process, it is always recommended that they be reviewed again prior to implementation, to make sure the existing ecosystem will still benefit from the change. -Alternatively, if an RFC is not likely to be implemented, it should be labelled as "rejected" and permanently closed. +Alternatively, if an RFC is not likely to be implemented, it should be labeled as "rejected" and permanently closed. ## Contributions -[Contributions]: #contributions Any contribution to FlyteOrg repos will be licensed with [Apache-2.0](https://github.com/flyteorg/flyte/blob/master/LICENSE) license without any additional terms or conditions. -## Prior art -* [Rust RFC process](https://github.com/rust-lang/rfcs) -* [Python PEP process](https://peps.python.org/pep-0001/) -* [ASF community design](https://community.apache.org/committers/lazyConsensus.html) +## Prior Art + +- [Rust RFC process](https://github.com/rust-lang/rfcs) +- [Python PEP process](https://peps.python.org/pep-0001/) +- [ASF community design](https://community.apache.org/committers/lazyConsensus.html) diff --git a/rsts/community/contribute.rst b/rsts/community/contribute.rst index 6bd877a9c0..accda13b56 100644 --- a/rsts/community/contribute.rst +++ b/rsts/community/contribute.rst @@ -419,48 +419,31 @@ that integrates all Flyte components into a single binary. **4. Build single binary with your own code.** -The following instructions provide guidance on how to build single binary with your customized code, using ``flyteadmin`` as an example. +The following instructions provide guidance on how to build single binary with your customized code under the ``flyteadmin`` as an example. - **Note** Although we'll use ``flyteadmin`` as an example, these steps can be applied to other Flyte components or libraries as well. 
``{flyteadmin}`` below can be substituted with other Flyte components/libraries: ``flyteidl``, ``flyteplugins``, ``flytepropeller``, ``datacatalog``, or ``flytestdlib``. - -- **Note** If modifications are needed in multiple components/libraries, the steps will need to be repeated for each component/library. +- **Note** If you want to learn how flyte compiles those components and replace the repositories, you can study how ``go mod edit`` works. .. code:: shell - # Step1: Fork and clone the {flyteadmin} repository, modify the source code accordingly. - git clone https://github.com/flyteorg/flyteadmin.git - cd flyteadmin - - # Step2.1: {Flyteadmin} uses Go 1.19, so make sure to switch to Go 1.19. + # Step1: Install Go. Flyte uses Go 1.19, so make sure to switch to Go 1.19. export PATH=$PATH:$(go env GOPATH)/bin go install golang.org/dl/go1.19@latest go1.19 download export GOROOT=$(go1.19 env GOROOT) export PATH="$GOROOT/bin:$PATH" - - # Step2.2: You may need to install goimports to fix lint errors. + + # You may need to install goimports to fix lint errors. # Refer to https://pkg.go.dev/golang.org/x/tools/cmd/goimports go install golang.org/x/tools/cmd/goimports@latest export PATH=$(go env GOPATH)/bin:$PATH - # Step 3.1: Review the go.mod file in the {flyteadmin} directory to identify the Flyte components/libraries that {flyteadmin} relies on. - # If you have modified any of these components/libraries, use `go mod edit -replace` in the {flyteadmin} repo to replace original components/libraries with your customized ones. - # For instance, if you have also modified `flytepropeller`, run the following commands: - go mod edit -replace github.com/flyteorg/flytepropeller=/home/ubuntu/flytepropeller #replace with your own local path to flytepropeller + # Step2: Go to the {flyteadmin} repository, modify the source code accordingly. + cd flyte/flyteadmin - # Step 3.2: Generate code, fix lint errors and run unit tests for {flyteadmin}. 
- # Note, flyteidl does not have unit tests, so you can skip the `make test_unit` command. - # Note, flytestdlib only have `make generate` command. - make generate - make lint - make test_unit - - # Step4: Now, you can build the single binary. Go back to Flyte directory, run `go mod edit -replace` to replace the {flyteadmin} code with your own. - go mod edit -replace github.com/flyteorg/flyteadmin=/home/ubuntu/flyteadmin #replace with your own local path to {flyteadmin} - - # Step5: Rebuild and rerun the single binary based on your own code. + # Step3: Now, you can build the single binary. Go back to Flyte directory. go mod tidy make compile POD_NAMESPACE=flyte ./flyte start --config flyte-single-binary-local.yaml @@ -473,12 +456,10 @@ The following instructions provide guidance on how to build single binary with y # Step1: Install flytekit pip install flytekit && export PATH=$PATH:/home/ubuntu/.local/bin - # Step2: The flytesnacks repository provides a lot of useful examples. - git clone https://github.com/flyteorg/flytesnacks && cd flytesnacks - - # Step3: Run a hello world example - pyflyte run --remote examples/basics/basics/hello_world.py my_wf + # Step2: Run a hello world example + pyflyte run --remote https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/basics/basics/hello_world.py hello_world_wf # Go to http://localhost:30080/console/projects/flytesnacks/domains/development/executions/fd63f88a55fed4bba846 to see execution in the console. + # You can go to the [flytesnacks repository](https://github.com/flyteorg/flytesnacks) to see more useful examples. **6. Tear down the k3s cluster after finishing developing.** @@ -534,20 +515,32 @@ If not, we can start backends with a single command. source ~/.virtualenvs/flytekit/bin/activate make setup pip install -e . 
- pip install gsutil awscli - # If you are also developing the plugins, execute the following: + + # If you are also developing the plugins, consider the following: + + # Installing Specific Plugins: + # If you wish to only use few plugins, you can install them individually. + # Take [Flytekit BigQuery Plugin](https://github.com/flyteorg/flytekit/tree/master/plugins/flytekit-bigquery#flytekit-bigquery-plugin) for example: + # You have to go to the bigquery plugin folder and install it. + cd plugins/flytekit-bigquery/ + pip install -e . + # Now you can use the bigquery plugin, and the performance is fast. + + # (Optional) Installing All Plugins: + # If you wish to install all available plugins, you can execute the command below. + # However, it's not typically recommended because the current version of plugins does not support + # lazy loading. This can lead to a slowdown in the performance of your Python engine. cd plugins pip install -e . + # Now you can use all plugins, but the performance is slow. # Step2: Modify the source code for flytekit, then run unit tests and lint. make lint make test # Step3: Run a hello world sample to test locally - git clone https://github.com/flyteorg/flytesnacks - cd flytesnacks - python3 examples/basics/basics/hello_world.py my_wf - # Running my_wf() hello world + pyflyte run https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/basics/basics/hello_world.py hello_world_wf + # Running hello_world_wf() hello world **3. Run workflow in sandbox.** @@ -588,9 +581,8 @@ the Flyte cluster, and finally submit the workflow. 
docker push ${FLYTE_INTERNAL_IMAGE} # Step4: Submit a hello world workflow to the Flyte cluster - git clone https://github.com/flyteorg/flytesnacks cd flytesnacks - pyflyte run --image ${FLYTE_INTERNAL_IMAGE} --remote examples/basics/basics/hello_world.py my_wf + pyflyte run --image ${FLYTE_INTERNAL_IMAGE} --remote https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/basics/basics/hello_world.py hello_world_wf # Go to http://localhost:30080/console/projects/flytesnacks/domains/development/executions/f5c17e1b5640c4336bf8 to see execution in the console. How to setup dev environment for flyteconsole? diff --git a/rsts/community/index.rst b/rsts/community/index.rst index 5a9d9b620d..c2ee55ae23 100644 --- a/rsts/community/index.rst +++ b/rsts/community/index.rst @@ -26,14 +26,6 @@ Please join us on: :target: https://www.linkedin.com/groups/13962256 :alt: LinkedIn -.. TODO: add back when new newsletter is up and running -.. Also, feel free to sign up for our newsletter, Flyte Monthly, for a quick update on what we've been up to and upcoming events. - -.. .. link-button:: https://www.getrevue.co/profile/flyte -.. :type: url -.. :text: Flyte Monthly -.. :classes: btn-outline-secondary - Open Source Community Meeting ----------------------------- @@ -60,7 +52,7 @@ Schedule your session depending on the topic to secure the availability of a mai - Plugin implementation - Platform configuration - **1:00p.m. PT**: - - Flyte deployment on cloud + - Flyte deployment, auth - **9:00p.m. PT**: - Flytekit-related - Use cases @@ -68,8 +60,68 @@ Schedule your session depending on the topic to secure the availability of a mai - Integrations - Newsletter ---------- `Join the Flyte mailing list `_ to receive the monthly newsletter + + +Slack guidelines +----------------- + +Flyte strives to build and maintain an open, inclusive, productive and self-governing open source community. 
In consequence, +we expect all community members to respect the following guidelines: + +Abide by the `LF's Code of Conduct `__ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +As a Linux Foundation project, we must enforce the rules that govern professional and positive open source communities. + +Avoid using DMs and @mentions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Whenever possible, post your questions and responses in public channels so other Community Members can benefit from the conversation and outcomes. +Exceptions to this are when you need to share private or sensitive information. In such a case, the outcome should still be shared publicly. +Limit the use of @mentions of other Community Members to be considerate of notification noise. + +Make use of threads +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Threads help us keep conversations contained and organized, reducing the time it takes to give you the support you need. + +Thread best practices: + +- Don't break your question into multiple messages. Put everything in one. +- For long questions, write a few sentences in the first message, and put the rest in a thread. +- If there's a code snippet (more than 5 lines of code), put it inside the thread. +- Avoid using the β€œAlso send to channel” feature unless it's really necessary. +- If your question contains multiple questions, make sure to break them into multiple messages, so each could be answered in a separate thread. + + +Do not post the same question across multiple channels +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you consider that a question needs to be shared on other channels, ask it once and then indicate explicitly that you're cross-posting. + +If you're having a tough time getting the support you need (or aren't sure where to go!), please DM @David Espejo(he/him) or @Samhita Alla for support. 
+ +Do not solicit members of our Slack +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Flyte Community exists to collaborate with, learn from, and support one another. It is not a space to pitch your products or services directly to our members via public channels, private channels, or direct messages. + +We are excited to have a growing presence from vendors to help answer questions from Community Members as they may arise, but we have a strict 3-strike policy against solicitation: + +- First occurrence: We'll give you a friendly but public reminder that the behavior is inappropriate according to our guidelines. +- Second occurrence: We'll send you a DM warning that any additional violations will result in removal from the community. +- Third occurrence: We'll delete or ban your account. + +We reserve the right to ban users without notice if they are clearly spamming our Community Members. + +If you want to promote a product or service, go to the #shameless-promotion channel and make sure to follow these rules: + +- Don't post more than two promotional posts per week +- Non-relevant topics aren't allowed + +Messages that don't follow these rules will be deleted. + + diff --git a/rsts/concepts/component_architecture/flytepropeller_architecture.rst b/rsts/concepts/component_architecture/flytepropeller_architecture.rst index 4fa3cc2cd6..a04f6dbe4d 100644 --- a/rsts/concepts/component_architecture/flytepropeller_architecture.rst +++ b/rsts/concepts/component_architecture/flytepropeller_architecture.rst @@ -13,6 +13,7 @@ Introduction ============ A Flyte :ref:`workflow ` is represented as a Directed Acyclic Graph (DAG) of interconnected Nodes. Flyte supports a robust collection of Node types to ensure diverse functionality. + - ``TaskNodes`` support a plugin system to externally add system integrations. - Control flow can be altered during runtime using ``BranchNodes``, which prune downstream evaluation paths based on input. 
- ``DynamicNodes`` add nodes to the DAG. diff --git a/rsts/concepts/tasks.rst b/rsts/concepts/tasks.rst index deff1b30fa..cd1059d2f4 100644 --- a/rsts/concepts/tasks.rst +++ b/rsts/concepts/tasks.rst @@ -106,6 +106,10 @@ System retry can be of two types: Recoverable vs. Non-Recoverable failures: Recoverable failures will be retried and counted against the task's retry count. Non-recoverable failures will just fail, i.e., the task isn’t retried irrespective of user/system retry configurations. All user exceptions are considered non-recoverable unless the exception is a subclass of FlyteRecoverableException. +.. note:: + + `RFC 3902 `_ implements an alternative, simplified retry behaviour with which both system and user retries are counted towards a single retry budget defined in the task decorator (thus, without a second retry budget defined in the platform configuration). The last retries are always performed on non-spot instances to guarantee completion. To activate this behaviour, set ``configmap.core.propeller.node-config.ignore-retry-cause`` to ``true`` in the helm values. + **Timeouts** To ensure that the system is always making progress, tasks must be guaranteed to end gracefully/successfully. The system defines a default timeout period for the tasks. It is possible for task authors to define a timeout period, after which the task is marked as ``failure``. Note that a timed-out task will be retried if it has a retry strategy defined. The timeout can be handled in the `TaskMetadata `__. diff --git a/rsts/deployment/configuration/general.rst b/rsts/deployment/configuration/general.rst index 32e162afe8..c61d096930 100644 --- a/rsts/deployment/configuration/general.rst +++ b/rsts/deployment/configuration/general.rst @@ -105,11 +105,11 @@ Define an attributes file, ``cra.yaml``: .. 
code-block:: yaml + domain: development + project: flyteexamples attributes: projectQuotaCpu: "1000" projectQuotaMemory: 5Ti - domain: development - project: flyteexamples To ensure that the overrides reflect in the Kubernetes namespace ``flyteexamples-development`` (that is, the namespace has a resource quota of diff --git a/rsts/deployment/deployment/sandbox.rst b/rsts/deployment/deployment/sandbox.rst index 5c40eea5eb..46809ae4a3 100644 --- a/rsts/deployment/deployment/sandbox.rst +++ b/rsts/deployment/deployment/sandbox.rst @@ -78,5 +78,35 @@ who wish to dig deeper into the storage layer. πŸ‹ Flyte sandbox ships with a Docker registry. Tag and push custom workflow images to localhost:30000 πŸ“‚ The Minio API is hosted on localhost:30002. Use http://localhost:30080/minio/login for Minio console +Configuration +______________ + +The ``config-sandbox.yaml`` file contains configuration for **FlyteAdmin**, +which is the Flyte cluster backend component that processes all client requests +such as workflow executions. The default values are enough to let you connect and use Flyte: + + +.. code-block:: yaml + + admin: + # For GRPC endpoints you might want to use dns:///flyte.myexample.com + endpoint: localhost:30080 + authType: Pkce + insecure: true + console: + endpoint: http://localhost:30080 + logger: + show-source: true + level: 0 + +.. note:: + + You can also create your own config file with `flytectl config init`, which + will create a config file at `~/.flyte/config.yaml`. + + Learn more about the configuration settings in the + :ref:`Deployment Guide ` + + Now that you have the sandbox cluster running, you can now go to the :ref:`User Guide ` or :ref:`Tutorials ` to run tasks and workflows written in ``flytekit``, the Python SDK for Flyte. 
\ No newline at end of file diff --git a/script/generate_helm.sh b/script/generate_helm.sh index 183b22debc..43caee8203 100755 --- a/script/generate_helm.sh +++ b/script/generate_helm.sh @@ -4,9 +4,14 @@ set -ex echo "Generating Helm" -curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash +HELM_SKIP_INSTALL=${HELM_SKIP_INSTALL:-false} + +if [ "${HELM_SKIP_INSTALL}" != "true" ]; then + curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash +fi helm version + # All the values files to be built DEPLOYMENT_CORE=${1:-eks gcp} diff --git a/script/release.sh b/script/release.sh index 190ecbc8f1..1f3d763fee 100755 --- a/script/release.sh +++ b/script/release.sh @@ -2,44 +2,33 @@ set -ex -# Get latest release tag (minus the component prefix) -# TODO(monorepo): This only works if we have at least one component tag per release. -# In other words, if we have two consecutive releases the latest tag in the second release is going to point to an invalid -# tag (because there will not be images tagged with the previous release tag). 
-LATEST_TAG=$(git tag | sed 's#[^/]*/##' | sort --version-sort | tail -n1) FLYTEKIT_TAG=$(curl --silent "https://api.github.com/repos/flyteorg/flytekit/releases/latest" | jq -r .tag_name | sed 's/^v//') FLYTECONSOLE_TAG=$(curl --silent "https://api.github.com/repos/flyteorg/flyteconsole/releases/latest" | jq -r .tag_name) # bump latest release of flyte component in kustomize -grep -rlZ "newTag:[^P]*# FLYTEADMIN_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# FLYTEADMIN_TAG/newTag: ${LATEST_TAG} # FLYTEADMIN_TAG/g" {} -grep -rlZ "newTag:[^P]*# DATACATALOG_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# DATACATALOG_TAG/newTag: ${LATEST_TAG} # DATACATALOG_TAG/g" {} +grep -rlZ "newTag:[^P]*# FLYTEADMIN_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# FLYTEADMIN_TAG/newTag: ${VERSION} # FLYTEADMIN_TAG/g" {} +grep -rlZ "newTag:[^P]*# DATACATALOG_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# DATACATALOG_TAG/newTag: ${VERSION} # DATACATALOG_TAG/g" {} grep -rlZ "newTag:[^P]*# FLYTECONSOLE_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# FLYTECONSOLE_TAG/newTag: ${FLYTECONSOLE_TAG} # FLYTECONSOLE_TAG/g" {} -grep -rlZ "newTag:[^P]*# FLYTEPROPELLER_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# FLYTEPROPELLER_TAG/newTag: ${LATEST_TAG} # FLYTEPROPELLER_TAG/g" {} +grep -rlZ "newTag:[^P]*# FLYTEPROPELLER_TAG" ./kustomize/overlays | xargs -I {} sed -i "s/newTag:[^P]*# FLYTEPROPELLER_TAG/newTag: ${VERSION} # FLYTEPROPELLER_TAG/g" {} # bump latest release of flyte component in helm -sed -i "s,tag:[^P]*# FLYTEADMIN_TAG,tag: ${LATEST_TAG} # FLYTEADMIN_TAG," ./charts/flyte/values.yaml -sed -i "s,tag:[^P]*# FLYTEADMIN_TAG,tag: ${LATEST_TAG} # FLYTEADMIN_TAG," ./charts/flyte-core/values.yaml +sed -i "s,tag:[^P]*# FLYTEADMIN_TAG,tag: ${VERSION} # FLYTEADMIN_TAG," ./charts/flyte/values.yaml +sed -i "s,tag:[^P]*# FLYTEADMIN_TAG,tag: ${VERSION} # FLYTEADMIN_TAG," ./charts/flyte-core/values.yaml 
-sed -i "s,tag:[^P]*# FLYTESCHEDULER_TAG,tag: ${LATEST_TAG} # FLYTESCHEDULER_TAG," ./charts/flyte/values.yaml -sed -i "s,tag:[^P]*# FLYTESCHEDULER_TAG,tag: ${LATEST_TAG} # FLYTESCHEDULER_TAG," ./charts/flyte-core/values.yaml +sed -i "s,tag:[^P]*# FLYTESCHEDULER_TAG,tag: ${VERSION} # FLYTESCHEDULER_TAG," ./charts/flyte/values.yaml +sed -i "s,tag:[^P]*# FLYTESCHEDULER_TAG,tag: ${VERSION} # FLYTESCHEDULER_TAG," ./charts/flyte-core/values.yaml -sed -i "s,tag:[^P]*# DATACATALOG_TAG,tag: ${LATEST_TAG} # DATACATALOG_TAG," ./charts/flyte/values.yaml -sed -i "s,tag:[^P]*# DATACATALOG_TAG,tag: ${LATEST_TAG} # DATACATALOG_TAG," ./charts/flyte-core/values.yaml +sed -i "s,tag:[^P]*# DATACATALOG_TAG,tag: ${VERSION} # DATACATALOG_TAG," ./charts/flyte/values.yaml +sed -i "s,tag:[^P]*# DATACATALOG_TAG,tag: ${VERSION} # DATACATALOG_TAG," ./charts/flyte-core/values.yaml sed -i "s,tag:[^P]*# FLYTECONSOLE_TAG,tag: ${FLYTECONSOLE_TAG} # FLYTECONSOLE_TAG," ./charts/flyte/values.yaml sed -i "s,tag:[^P]*# FLYTECONSOLE_TAG,tag: ${FLYTECONSOLE_TAG} # FLYTECONSOLE_TAG," ./charts/flyte-core/values.yaml -sed -i "s,tag:[^P]*# FLYTEPROPELLER_TAG,tag: ${LATEST_TAG} # FLYTEPROPELLER_TAG," ./charts/flyte/values.yaml -sed -i "s,tag:[^P]*# FLYTEPROPELLER_TAG,tag: ${LATEST_TAG} # FLYTEPROPELLER_TAG," ./charts/flyte-core/values.yaml +sed -i "s,tag:[^P]*# FLYTEPROPELLER_TAG,tag: ${VERSION} # FLYTEPROPELLER_TAG," ./charts/flyte/values.yaml +sed -i "s,tag:[^P]*# FLYTEPROPELLER_TAG,tag: ${VERSION} # FLYTEPROPELLER_TAG," ./charts/flyte-core/values.yaml -sed -i "s,image:[^P]*# FLYTECOPILOT_IMAGE,image: cr.flyte.org/flyteorg/flytecopilot:${LATEST_TAG} # FLYTECOPILOT_IMAGE," ./charts/flyte/values.yaml -sed -i "s,image:[^P]*# FLYTECOPILOT_IMAGE,image: cr.flyte.org/flyteorg/flytecopilot:${LATEST_TAG} # FLYTECOPILOT_IMAGE," ./charts/flyte-core/values.yaml -sed -i "s,tag:[^P]*# FLYTECOPILOT_TAG,tag: ${LATEST_TAG} # FLYTECOPILOT_TAG," ./charts/flyte-binary/values.yaml +sed -i "s,image:[^P]*# 
FLYTECOPILOT_IMAGE,image: cr.flyte.org/flyteorg/flytecopilot:${VERSION} # FLYTECOPILOT_IMAGE," ./charts/flyte/values.yaml +sed -i "s,image:[^P]*# FLYTECOPILOT_IMAGE,image: cr.flyte.org/flyteorg/flytecopilot:${VERSION} # FLYTECOPILOT_IMAGE," ./charts/flyte-core/values.yaml +sed -i "s,tag:[^P]*# FLYTECOPILOT_TAG,tag: ${VERSION} # FLYTECOPILOT_TAG," ./charts/flyte-binary/values.yaml sed -i "s,tag:[^P]*# FLYTEAGENT_TAG,tag: ${FLYTEKIT_TAG} # FLYTEAGENT_TAG," ./charts/flyteagent/values.yaml - -go get github.com/flyteorg/flyte/flyteadmin@${LATEST_TAG} -go get github.com/flyteorg/flyte/flytepropeller@${LATEST_TAG} -go get github.com/flyteorg/flyte/datacatalog@${LATEST_TAG} -go get github.com/flyteorg/flyte/flytestdlib@${LATEST_TAG} -go mod tidy